Spaces:
Sleeping
Sleeping
File size: 3,763 Bytes
3818144 cb9ff77 3818144 cb9ff77 3818144 cb9ff77 3818144 cb9ff77 3818144 cb9ff77 3818144 2c4447e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import uuid
import json
from typing import List, Dict, Any, Tuple
from openpyxl import load_workbook
# XLSX to JSON conversion functions
def _looks_numeric(text: str) -> bool:
    """Return True when *text* is a plain decimal number, optionally signed.

    Accepts digits with at most one decimal point (``"12"``, ``"3.5"``,
    ``".5"``) preceded by at most one ``+`` or ``-``. A bare sign or a lone
    dash placeholder (``"-"``) is not numeric.
    """
    candidate = text.strip()
    # Negative values stringify as "-12.5"; drop one sign before the digit test.
    if candidate.startswith(("+", "-")):
        candidate = candidate[1:]
    return candidate.replace(".", "", 1).isdigit()


def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[str, Any]]]:
    """Split a worksheet's rows into table rows and free-text paragraphs.

    Rows are classified top to bottom:

    * rows in which every cell is ``None`` are skipped;
    * a row with at least two non-empty cells, one of which is numeric,
      is a table row and marks the table as started;
    * once the table has started, any row with at least two non-empty
      cells is also a table row (the flag is never reset);
    * every other row becomes a paragraph made of its non-empty cells
      joined by single spaces (rows that end up empty are dropped).

    Args:
        worksheet: Object exposing ``max_row`` and
            ``iter_rows(min_row=..., max_row=..., values_only=True)``,
            e.g. an openpyxl worksheet.

    Returns:
        ``(table_data, paragraphs)`` where ``table_data`` is a list of rows
        of stripped cell strings (``""`` for empty cells) and ``paragraphs``
        is a list of ``{"uid", "order", "text"}`` dicts with a fresh UUID4
        string and a 1-based ``order``.
    """
    # Stringify and strip every cell; None becomes "" so rows keep their width.
    data = [
        [str(cell).strip() if cell is not None else "" for cell in row]
        for row in worksheet.iter_rows(min_row=1, max_row=worksheet.max_row, values_only=True)
        if any(cell is not None for cell in row)
    ]

    table_data = []
    paragraph_texts = []
    in_table = False
    for row in data:
        # Cells are already stripped, so truthiness is the non-empty test.
        non_empty = [cell for cell in row if cell]
        multi_cell = len(non_empty) >= 2
        if multi_cell and any(_looks_numeric(cell) for cell in non_empty):
            in_table = True
            table_data.append(row)
        elif in_table and multi_cell:
            table_data.append(row)
        else:
            paragraph = " ".join(non_empty)
            if paragraph:
                paragraph_texts.append(paragraph)

    # Post-process paragraphs into structured format
    paragraphs = [
        {
            "uid": str(uuid.uuid4()),
            "order": order,
            "text": text,
        }
        for order, text in enumerate(paragraph_texts, start=1)
    ]
    return table_data, paragraphs
def xlsx_to_json(file_path) -> Dict[str, Any]:
    """Load an XLSX file and convert its active sheet to the JSON layout.

    The workbook is opened with ``data_only=True`` (per openpyxl's
    documentation this yields stored values rather than formulas), and the
    active worksheet is split by ``detect_table_and_paragraphs``.

    Args:
        file_path: Location of the workbook, passed straight to
            ``load_workbook``.

    Returns:
        A dict with three keys: ``"table"`` (a fresh UUID4 ``"uid"`` plus
        the detected rows under ``"table"``), ``"paragraphs"`` (the detected
        paragraph entries) and ``"questions"`` (always an empty list).
    """
    active_sheet = load_workbook(file_path, data_only=True).active
    rows, paragraph_entries = detect_table_and_paragraphs(active_sheet)
    table_section = {"uid": str(uuid.uuid4()), "table": rows}
    return {
        "table": table_section,
        "paragraphs": paragraph_entries,
        "questions": [],
    }
def json_to_jsonl(json_data: Dict[str, Any]) -> str:
    """Serialize *json_data* as one compact JSON string.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and no
    trailing newline is appended.
    """
    serialized = json.dumps(json_data, ensure_ascii=False)
    return serialized
def _markdown_cell(value: Any) -> str:
    """Render one cell so its text cannot break the Markdown table row."""
    text = str(value)
    # Collapse every newline flavour first so each break becomes one <br>.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # A raw "|" would start a new column; a raw newline would end the row.
    return text.replace("|", "\\|").replace("\n", "<br>")


def _markdown_row(cells) -> str:
    """Format *cells* as a single newline-terminated Markdown table row."""
    return "| " + " | ".join(_markdown_cell(cell) for cell in cells) + " |\n"


def json_to_markdown(json_data: Dict[str, Any]) -> str:
    """Render the table and paragraphs of *json_data* as Markdown.

    The output has a ``## Data Tabel`` section holding the table (first row
    as header, followed by a ``---`` separator row and the remaining rows)
    and a ``## Konteks/Paragraf`` section listing each paragraph as
    ``"<order>. <text>"``. An empty table produces no table rows.

    Pipe characters in cells are escaped as ``\\|`` and line breaks are
    replaced with ``<br>``; cells are converted with ``str`` so non-string
    values are accepted.

    Args:
        json_data: Dict with ``json_data["table"]["table"]`` (list of rows)
            and ``json_data["paragraphs"]`` (dicts with ``"order"`` and
            ``"text"``).

    Returns:
        The Markdown document as a single string.

    Raises:
        KeyError: If one of the keys above is missing.
    """
    parts = ["## Data Tabel\n\n"]

    # Convert table to markdown
    table = json_data["table"]["table"]
    if table:
        header, *body_rows = table
        parts.append(_markdown_row(header))
        parts.append(_markdown_row(["---"] * len(header)))
        parts.extend(_markdown_row(row) for row in body_rows)

    # Add paragraphs
    parts.append("\n## Konteks/Paragraf\n\n")
    parts.extend(
        f"{para['order']}. {para['text']}\n\n" for para in json_data["paragraphs"]
    )
    return "".join(parts)
# Prompt creation function
def create_prompt(table_data: Dict[str, Any], question: str) -> str:
    """Build the six-step instruction prompt for one question.

    The table rows are rendered one per line as ``| cell | cell |`` (no
    header separator row), the paragraph texts are joined with newlines,
    and both are embedded with *question* into a fixed instruction
    template that ends with ``### Answer``.

    Args:
        table_data: Dict with ``table_data["table"]["table"]`` (rows of
            strings) and ``table_data["paragraphs"]`` (dicts with a
            ``"text"`` key).
        question: The question to place under ``### Question``.

    Returns:
        The complete prompt string.
    """
    rows = table_data["table"]["table"]
    rendered_table = "\n".join(f"| {' | '.join(row)} |" for row in rows)
    joined_text = "\n".join(entry["text"] for entry in table_data["paragraphs"])
    # Doubled braces below emit literal {placeholders} in the prompt text.
    return f"""### Instruction
Given a table and a list of texts in the following, answer the question posed using the following six-step process:
1. Step 1: Predict the type of question being asked. Store this prediction in the variable {{question_type}}.
2. Step 2: Extract the relevant strings or numerical values from the provided table or texts. Store them in {{evidence}}.
3. Step 3: If {{question_type}} is Arithmetic, generate an equation in {{equation}}. Otherwise, put N.A..
4. Step 4: Compute the final answer and store in {{answer}}.
5. Step 5: Predict the answer's scale in {{scale}}. One of: none, percent, thousand, million, billion.
6. Step 6: Based on the {{answer}} and {{question_type}}, generate a short and logical recommendation, business insight, or next action. Store it in {{action}}.
### Table
{rendered_table}
### Text
{joined_text}
### Question
{question}
### Answer"""