Spaces:
Sleeping
Sleeping
import uuid | |
import json | |
from typing import List, Dict, Any, Tuple | |
from openpyxl import load_workbook | |
# XLSX to JSON conversion functions | |
def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[str, Any]]]: | |
data = [] | |
max_col = worksheet.max_column | |
max_row = worksheet.max_row | |
for row in worksheet.iter_rows(min_row=1, max_row=max_row, values_only=True): | |
if any(cell is not None for cell in row): | |
data.append([str(cell).strip() if cell is not None else "" for cell in row]) | |
table_data = [] | |
paragraph_texts = [] | |
in_table = False | |
for row in data: | |
# Count non-empty cells | |
non_empty = [cell for cell in row if cell.strip()] | |
if len(non_empty) >= 2 and any(cell.strip().replace('.', '', 1).isdigit() for cell in non_empty): | |
in_table = True | |
table_data.append(row) | |
elif in_table and len(non_empty) >= 2: | |
table_data.append(row) | |
else: | |
paragraph = " ".join(non_empty) | |
if paragraph: | |
paragraph_texts.append(paragraph) | |
# Post-process paragraphs into structured format | |
paragraphs = [ | |
{ | |
"uid": str(uuid.uuid4()), | |
"order": i + 1, | |
"text": text | |
} for i, text in enumerate(paragraph_texts) | |
] | |
return table_data, paragraphs | |
def xlsx_to_json(file_path) -> Dict[str, Any]: | |
workbook = load_workbook(file_path, data_only=True) | |
worksheet = workbook.active | |
table_data, paragraphs = detect_table_and_paragraphs(worksheet) | |
json_data = { | |
"table": { | |
"uid": str(uuid.uuid4()), | |
"table": table_data | |
}, | |
"paragraphs": paragraphs, | |
"questions": [] | |
} | |
return json_data | |
def json_to_jsonl(json_data: Dict[str, Any]) -> str: | |
return json.dumps(json_data, ensure_ascii=False) | |
def json_to_markdown(json_data: Dict[str, Any]) -> str: | |
markdown_content = "## Data Tabel\n\n" | |
# Convert table to markdown | |
table = json_data["table"]["table"] | |
if table: | |
markdown_content += "| " + " | ".join(table[0]) + " |\n" | |
markdown_content += "| " + " | ".join(["---"] * len(table[0])) + " |\n" | |
for row in table[1:]: | |
markdown_content += "| " + " | ".join(row) + " |\n" | |
# Add paragraphs | |
markdown_content += "\n## Konteks/Paragraf\n\n" | |
for para in json_data["paragraphs"]: | |
markdown_content += f"{para['order']}. {para['text']}\n\n" | |
return markdown_content | |
# Prompt creation function | |
def create_prompt(table_data: Dict[str, Any], question: str) -> str: | |
table = table_data["table"]["table"] | |
table_md = "\n".join(["| " + " | ".join(row) + " |" for row in table]) | |
text_content = "\n".join([p["text"] for p in table_data["paragraphs"]]) | |
prompt = f"""### Instruction | |
Given a table and a list of texts in the following, answer the question posed using the following six-step process: | |
1. Step 1: Predict the type of question being asked. Store this prediction in the variable {{question_type}}. | |
2. Step 2: Extract the relevant strings or numerical values from the provided table or texts. Store them in {{evidence}}. | |
3. Step 3: If {{question_type}} is Arithmetic, generate an equation in {{equation}}. Otherwise, put N.A.. | |
4. Step 4: Compute the final answer and store in {{answer}}. | |
5. Step 5: Predict the answer's scale in {{scale}}. One of: none, percent, thousand, million, billion. | |
6. Step 6: Based on the {{answer}} and {{question_type}}, generate a short and logical recommendation, business insight, or next action. Store it in {{action}}. | |
### Table | |
{table_md} | |
### Text | |
{text_content} | |
### Question | |
{question} | |
### Answer""" | |
return prompt |