import uuid import json from typing import List, Dict, Any, Tuple from openpyxl import load_workbook # XLSX to JSON conversion functions def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[str, Any]]]: data = [] max_col = worksheet.max_column max_row = worksheet.max_row for row in worksheet.iter_rows(min_row=1, max_row=max_row, values_only=True): if any(cell is not None for cell in row): data.append([str(cell).strip() if cell is not None else "" for cell in row]) table_data = [] paragraph_texts = [] in_table = False for row in data: # Count non-empty cells non_empty = [cell for cell in row if cell.strip()] if len(non_empty) >= 2 and any(cell.strip().replace('.', '', 1).isdigit() for cell in non_empty): in_table = True table_data.append(row) elif in_table and len(non_empty) >= 2: table_data.append(row) else: paragraph = " ".join(non_empty) if paragraph: paragraph_texts.append(paragraph) # Post-process paragraphs into structured format paragraphs = [ { "uid": str(uuid.uuid4()), "order": i + 1, "text": text } for i, text in enumerate(paragraph_texts) ] return table_data, paragraphs def xlsx_to_json(file_path) -> Dict[str, Any]: workbook = load_workbook(file_path, data_only=True) worksheet = workbook.active table_data, paragraphs = detect_table_and_paragraphs(worksheet) json_data = { "table": { "uid": str(uuid.uuid4()), "table": table_data }, "paragraphs": paragraphs, "questions": [] } return json_data def json_to_jsonl(json_data: Dict[str, Any]) -> str: return json.dumps(json_data, ensure_ascii=False) def json_to_markdown(json_data: Dict[str, Any]) -> str: markdown_content = "## Data Tabel\n\n" # Convert table to markdown table = json_data["table"]["table"] if table: markdown_content += "| " + " | ".join(table[0]) + " |\n" markdown_content += "| " + " | ".join(["---"] * len(table[0])) + " |\n" for row in table[1:]: markdown_content += "| " + " | ".join(row) + " |\n" # Add paragraphs markdown_content += "\n## Konteks/Paragraf\n\n" for para in json_data["paragraphs"]: markdown_content += f"{para['order']}. {para['text']}\n\n" return markdown_content # Prompt creation function def create_prompt(table_data: Dict[str, Any], question: str) -> str: table = table_data["table"]["table"] table_md = "\n".join(["| " + " | ".join(row) + " |" for row in table]) text_content = "\n".join([p["text"] for p in table_data["paragraphs"]]) prompt = f"""### Instruction Given a table and a list of texts in the following, answer the question posed using the following six-step process: 1. Step 1: Predict the type of question being asked. Store this prediction in the variable {{question_type}}. 2. Step 2: Extract the relevant strings or numerical values from the provided table or texts. Store them in {{evidence}}. 3. Step 3: If {{question_type}} is Arithmetic, generate an equation in {{equation}}. Otherwise, put N.A.. 4. Step 4: Compute the final answer and store in {{answer}}. 5. Step 5: Predict the answer's scale in {{scale}}. One of: none, percent, thousand, million, billion. 6. Step 6: Based on the {{answer}} and {{question_type}}, generate a short and logical recommendation, business insight, or next action. Store it in {{action}}. ### Table {table_md} ### Text {text_content} ### Question {question} ### Answer""" return prompt