Spaces:

BarudakLearning
/

terTATa

Sleeping

App Files Files Community

terTATa / utils.py

Rifqidits

Change to Bahasa Indonesia

cb9ff77 about 1 month ago

raw

history blame contribute delete

3.76 kB

	import uuid
	import json
	from typing import List, Dict, Any, Tuple
	from openpyxl import load_workbook

	# XLSX to JSON conversion functions
	def detect_table_and_paragraphs(worksheet) -> Tuple[List[List[str]], List[Dict[str, Any]]]:
	data = []
	max_col = worksheet.max_column
	max_row = worksheet.max_row

	for row in worksheet.iter_rows(min_row=1, max_row=max_row, values_only=True):
	if any(cell is not None for cell in row):
	data.append([str(cell).strip() if cell is not None else "" for cell in row])

	table_data = []
	paragraph_texts = []
	in_table = False

	for row in data:
	# Count non-empty cells
	non_empty = [cell for cell in row if cell.strip()]
	if len(non_empty) >= 2 and any(cell.strip().replace('.', '', 1).isdigit() for cell in non_empty):
	in_table = True
	table_data.append(row)
	elif in_table and len(non_empty) >= 2:
	table_data.append(row)
	else:
	paragraph = " ".join(non_empty)
	if paragraph:
	paragraph_texts.append(paragraph)

	# Post-process paragraphs into structured format
	paragraphs = [
	{
	"uid": str(uuid.uuid4()),
	"order": i + 1,
	"text": text
	} for i, text in enumerate(paragraph_texts)
	]

	return table_data, paragraphs

	def xlsx_to_json(file_path) -> Dict[str, Any]:
	workbook = load_workbook(file_path, data_only=True)
	worksheet = workbook.active

	table_data, paragraphs = detect_table_and_paragraphs(worksheet)

	json_data = {
	"table": {
	"uid": str(uuid.uuid4()),
	"table": table_data
	},
	"paragraphs": paragraphs,
	"questions": []
	}

	return json_data

	def json_to_jsonl(json_data: Dict[str, Any]) -> str:
	return json.dumps(json_data, ensure_ascii=False)

	def json_to_markdown(json_data: Dict[str, Any]) -> str:
	markdown_content = "## Data Tabel\n\n"

	# Convert table to markdown
	table = json_data["table"]["table"]
	if table:
	markdown_content += "\| " + " \| ".join(table[0]) + " \|\n"
	markdown_content += "\| " + " \| ".join(["---"] * len(table[0])) + " \|\n"
	for row in table[1:]:
	markdown_content += "\| " + " \| ".join(row) + " \|\n"

	# Add paragraphs
	markdown_content += "\n## Konteks/Paragraf\n\n"
	for para in json_data["paragraphs"]:
	markdown_content += f"{para['order']}. {para['text']}\n\n"

	return markdown_content

	# Prompt creation function
	def create_prompt(table_data: Dict[str, Any], question: str) -> str:
	table = table_data["table"]["table"]
	table_md = "\n".join(["\| " + " \| ".join(row) + " \|" for row in table])

	text_content = "\n".join([p["text"] for p in table_data["paragraphs"]])

	prompt = f"""### Instruction
	Given a table and a list of texts in the following, answer the question posed using the following six-step process:
	1. Step 1: Predict the type of question being asked. Store this prediction in the variable {{question_type}}.
	2. Step 2: Extract the relevant strings or numerical values from the provided table or texts. Store them in {{evidence}}.
	3. Step 3: If {{question_type}} is Arithmetic, generate an equation in {{equation}}. Otherwise, put N.A..
	4. Step 4: Compute the final answer and store in {{answer}}.
	5. Step 5: Predict the answer's scale in {{scale}}. One of: none, percent, thousand, million, billion.
	6. Step 6: Based on the {{answer}} and {{question_type}}, generate a short and logical recommendation, business insight, or next action. Store it in {{action}}.

	### Table
	{table_md}

	### Text
	{text_content}

	### Question
	{question}

	### Answer"""

	return prompt