""" | |
LynxScribe configuration and testing in LynxKite. | |
TODO: all these outputs should contain metadata. So the next task can check the input type, etc. | |
""" | |
from google.cloud import storage | |
from copy import deepcopy | |
from enum import Enum | |
import asyncio | |
import pandas as pd | |
from pydantic import BaseModel, ConfigDict | |
import pathlib | |
from lynxscribe.core.llm.base import get_llm_engine | |
from lynxscribe.core.vector_store.base import get_vector_store | |
from lynxscribe.common.config import load_config | |
from lynxscribe.components.text.embedder import TextEmbedder | |
from lynxscribe.core.models.embedding import Embedding | |
from lynxscribe.components.embedding_clustering import FclusterBasedClustering | |
from lynxscribe.components.rag.rag_graph import RAGGraph | |
from lynxscribe.components.rag.knowledge_base_graph import PandasKnowledgeBaseGraph | |
from lynxscribe.components.rag.rag_chatbot import Scenario, ScenarioSelector, RAGChatbot | |
from lynxscribe.components.chat.processors import ( | |
ChatProcessor, | |
MaskTemplate, | |
TruncateHistory, | |
) | |
from lynxscribe.components.chat.api import ChatAPI | |
from lynxscribe.core.models.prompts import ChatCompletionPrompt, Message | |
from lynxscribe.components.rag.loaders import FAQTemplateLoader | |
from lynxkite.core import ops | |
import json | |
from lynxkite.core.executors import one_by_one | |
DEFAULT_NEGATIVE_ANSWER = "I'm sorry, but the data I've been trained on does not contain any information related to your question." | |
ENV = "LynxScribe" | |
one_by_one.register(ENV) | |
op = ops.op_registration(ENV) | |
output_on_top = ops.output_position(output="top") | |


# defining the cloud provider enum
class CloudProvider(str, Enum):
    GCP = "gcp"
    AWS = "aws"
    AZURE = "azure"


class RAGVersion(str, Enum):
    V1 = "v1"
    V2 = "v2"


class MessageRole(str, Enum):
    SYSTEM = "system"
    USER = "user"


class RAGTemplate(BaseModel):
    """
    Model for RAG templates consisting of three tables that are connected via scenario names.
    One table (FAQs) contains scenario-denoted nodes to upsert into the knowledge base; the other
    two tables serve as the configuration for the scenario selector.

    Attributes:
        faq_data:
            Table where each row is an FAQ question, and possibly its answer pair. Will be fed into
            `FAQTemplateLoader.load_nodes_and_edges()`. For the configuration of this table see the
            loader's init arguments.
        scenario_data:
            Table where each row is a Scenario, column names are thus scenario attributes. Will be
            fed into `ScenarioSelector.from_data()`.
        prompt_codes:
            Optional helper for the scenario table; may contain prompt code mappings to real prompt
            messages. It is then enough to use the codes instead of the full messages in the
            scenarios table. Will be fed into `ScenarioSelector.from_data()`.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    faq_data: pd.DataFrame
    scenario_data: pd.DataFrame
    prompt_codes: dict[str, str] = {}
    @classmethod
    def from_excel_path(
        cls,
        path: str,
        faq_data_sheet_name: str,
        scenario_data_sheet_name: str,
        prompt_codes_sheet_name: str | None = None,
    ) -> "RAGTemplate":
        """Spawn a RAGTemplate from an Excel file containing the two needed (plus one optional) sheets."""

        def transform_codes(prompt_codes: pd.DataFrame) -> dict[str, str]:
            """Check and transform the prompt codes table into a code dictionary."""
            if (len_columns := len(prompt_codes.columns)) != 2:
                raise ValueError(
                    f"Prompt codes should contain exactly 2 columns, {len_columns} found."
                )
            return prompt_codes.set_index(prompt_codes.columns[0])[
                prompt_codes.columns[1]
            ].to_dict()

        return cls(
            faq_data=pd.read_excel(path, sheet_name=faq_data_sheet_name),
            scenario_data=pd.read_excel(path, sheet_name=scenario_data_sheet_name),
            prompt_codes=transform_codes(pd.read_excel(path, sheet_name=prompt_codes_sheet_name))
            if prompt_codes_sheet_name
            else {},
        )
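

# Illustrative sketch of loading a RAG template (the file path is hypothetical; the sheet
# names below are the ones used by `ls_faq_to_rag` further down in this module):
#
# template = RAGTemplate.from_excel_path(
#     path="uploads/faq_template.xlsx",
#     faq_data_sheet_name="scenario_examples",
#     scenario_data_sheet_name="scenario_scripts",
#     prompt_codes_sheet_name="prompt_dictionary",
# )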


def cloud_file_loader(
    *,
    cloud_provider: CloudProvider = CloudProvider.GCP,
    folder_URL: str = "https://storage.googleapis.com/lynxkite_public_data/lynxscribe-images/image-rag-test",
    accepted_file_types: str = ".jpg, .jpeg, .png",
):
    """
    Returns the list of URLs of all the images in a cloud-based folder.
    Currently only GCP storage is supported.
    """
    if folder_URL.endswith("/"):
        folder_URL = folder_URL[:-1]
    accepted_file_types = tuple([t.strip() for t in accepted_file_types.split(",")])

    if cloud_provider == CloudProvider.GCP:
        client = storage.Client()
        url_useful_part = folder_URL.split(".com/")[-1]
        bucket_name = url_useful_part.split("/")[0]
        if bucket_name == url_useful_part:
            prefix = ""
        else:
            prefix = url_useful_part.split(bucket_name + "/")[-1]
        bucket = client.bucket(bucket_name)
        blobs = bucket.list_blobs(prefix=prefix)
        file_urls = [blob.public_url for blob in blobs if blob.name.endswith(accepted_file_types)]
        return {"file_urls": file_urls}
    else:
        raise ValueError(f"Cloud provider '{cloud_provider}' is not supported.")
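

# Illustrative call (assumes GCP credentials are available; bucket and folder are hypothetical):
#
# cloud_file_loader(
#     cloud_provider=CloudProvider.GCP,
#     folder_URL="https://storage.googleapis.com/my-bucket/some/folder",
#     accepted_file_types=".png",
# )
#
# The URL is parsed into a bucket name ("my-bucket") and a prefix ("some/folder"), and the
# public URLs of the matching blobs are returned as {"file_urls": [...]}.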


# @output_on_top
# @op("LynxScribe RAG Graph Vector Store", slow=True)
# def ls_rag_graph(
#     *,
#     name: str = "faiss",
#     num_dimensions: int = 3072,
#     collection_name: str = "lynx",
#     text_embedder_interface: str = "openai",
#     text_embedder_model_name_or_path: str = "text-embedding-3-large",
#     # api_key_name: str = "OPENAI_API_KEY",
# ):
#     """
#     Returns a vector store instance.
#     """
#     # getting the text embedder instance
#     llm_params = {"name": text_embedder_interface}
#     # if api_key_name:
#     #     llm_params["api_key"] = os.getenv(api_key_name)
#     llm = get_llm_engine(**llm_params)
#     text_embedder = TextEmbedder(llm=llm, model=text_embedder_model_name_or_path)
#     # getting the vector store
#     if name == "chromadb":
#         vector_store = get_vector_store(name=name, collection_name=collection_name)
#     elif name == "faiss":
#         vector_store = get_vector_store(name=name, num_dimensions=num_dimensions)
#     else:
#         raise ValueError(f"Vector store name '{name}' is not supported.")
#     # building up the RAG graph
#     rag_graph = RAGGraph(
#         PandasKnowledgeBaseGraph(vector_store=vector_store, text_embedder=text_embedder)
#     )
#     return {"rag_graph": rag_graph}


async def ls_image_describer(
    file_urls,
    *,
    llm_interface: str = "openai",
    llm_visual_model: str = "gpt-4o",
    llm_prompt_path: str = "uploads/image_description_prompts.yaml",
    llm_prompt_name: str = "cot_picture_descriptor",
    # api_key_name: str = "OPENAI_API_KEY",
):
    """
    Returns image descriptions for a list of image URLs.

    TODO: make the inputs more flexible (e.g. accept file locations, URLs, binaries, etc.).
          The input dictionary should contain some meta info: e.g., what is in the list...
    """
    # handling inputs
    image_urls = file_urls["file_urls"]

    # loading the LLM
    llm_params = {"name": llm_interface}
    # if api_key_name:
    #     llm_params["api_key"] = os.getenv(api_key_name)
    llm = get_llm_engine(**llm_params)

    # preparing the prompts
    prompt_base = load_config(llm_prompt_path)[llm_prompt_name]
    prompt_list = []
    for image in image_urls:
        _prompt = deepcopy(prompt_base)
        for message in _prompt:
            if isinstance(message["content"], list):
                for _message_part in message["content"]:
                    if "image_url" in _message_part:
                        _message_part["image_url"] = {"url": image}
        prompt_list.append(_prompt)

    # creating the prompt objects
    ch_prompt_list = [
        ChatCompletionPrompt(model=llm_visual_model, messages=prompt) for prompt in prompt_list
    ]

    # get the image descriptions
    tasks = [llm.acreate_completion(completion_prompt=_prompt) for _prompt in ch_prompt_list]
    out_completions = await asyncio.gather(*tasks)
    results = [
        dictionary_corrector(result.choices[0].message.content) for result in out_completions
    ]

    # getting the image descriptions (list of dictionaries {image_url: URL, description: description})
    # TODO: some result class could be a better idea (will be developed in LynxScribe)
    image_descriptions = [
        {"image_url": image_urls[i], "description": results[i]} for i in range(len(image_urls))
    ]
    return {"image_descriptions": image_descriptions}
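

# Illustrative shape of the output (values are made up). Each description is the dictionary
# parsed from the LLM reply by `dictionary_corrector`; the downstream `ls_image_rag_builder`
# expects keys like "overall description" and "details":
#
# {
#     "image_descriptions": [
#         {
#             "image_url": "https://storage.googleapis.com/.../image_001.jpg",
#             "description": {"overall description": "...", "details": {"foreground": "..."}},
#         },
#     ]
# }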


async def ls_image_rag_builder(
    image_descriptions,
    *,
    vdb_provider_name: str = "faiss",
    vdb_num_dimensions: int = 3072,
    vdb_collection_name: str = "lynx",
    text_embedder_interface: str = "openai",
    text_embedder_model_name_or_path: str = "text-embedding-3-large",
    # api_key_name: str = "OPENAI_API_KEY",
):
    """
    Based on image descriptions and embedding/VDB parameters, the function builds up an
    image RAG graph, where the nodes are the descriptions of the images (and of all image
    objects). In a later phase, synthetic questions and "named entities" will also be
    added to the graph.
    """
    # handling inputs
    image_descriptions = image_descriptions["image_descriptions"]

    # Building up the empty RAG graph
    # a) Define LLM interface and get a text embedder
    llm_params = {"name": text_embedder_interface}
    # if api_key_name:
    #     llm_params["api_key"] = os.getenv(api_key_name)
    llm = get_llm_engine(**llm_params)
    text_embedder = TextEmbedder(llm=llm, model=text_embedder_model_name_or_path)

    # b) getting the vector store
    # TODO: vdb_provider_name should be an ENUM, and other parameters should appear accordingly
    if vdb_provider_name == "chromadb":
        vector_store = get_vector_store(name=vdb_provider_name, collection_name=vdb_collection_name)
    elif vdb_provider_name == "faiss":
        vector_store = get_vector_store(name=vdb_provider_name, num_dimensions=vdb_num_dimensions)
    else:
        raise ValueError(f"Vector store name '{vdb_provider_name}' is not supported.")

    # c) building up the RAG graph
    rag_graph = RAGGraph(
        PandasKnowledgeBaseGraph(vector_store=vector_store, text_embedder=text_embedder)
    )

    dict_list_df = []
    for image_description_tuple in image_descriptions:
        image_url = image_description_tuple["image_url"]
        image_description = image_description_tuple["description"]
        if "overall description" in image_description:
            dict_list_df.append(
                {
                    "image_url": image_url,
                    "description": image_description["overall description"],
                    "source": "overall description",
                }
            )
            if "details" in image_description:
                for dkey in image_description["details"].keys():
                    text = f"The picture's description is: {image_description['overall description']}\n\nThe description of the {dkey} is: {image_description['details'][dkey]}"
                    dict_list_df.append(
                        {"image_url": image_url, "description": text, "source": "details"}
                    )
    pdf_descriptions = pd.DataFrame(dict_list_df)
    pdf_descriptions["embedding_values"] = await text_embedder.acreate_embedding(
        pdf_descriptions["description"].to_list()
    )
    pdf_descriptions["id"] = "im_" + pdf_descriptions.index.astype(str)

    # adding the embeddings to the RAG graph with metadata
    pdf_descriptions["embedding"] = pdf_descriptions.apply(
        lambda row: Embedding(
            id=row["id"],
            value=row["embedding_values"],
            metadata={
                "image_url": row["image_url"],
                "image_part": row["source"],
                "type": "image_description",
            },
            document=row["description"],
        ),
        axis=1,
    )
    embedding_list = pdf_descriptions["embedding"].tolist()

    # adding the embeddings to the RAG graph
    rag_graph.kg_base.vector_store.upsert(embedding_list)

    # # saving the RAG graph
    # rag_graph.kg_base.save(image_rag_out_path)

    return {"rag_graph": rag_graph}


def ls_save_rag_graph(
    rag_graph,
    *,
    image_rag_out_path: str = "image_test_rag_graph.pickle",
):
    """
    Saves the RAG graph to a pickle file.
    """
    # reading inputs
    rag_graph = rag_graph[0]["rag_graph"]
    rag_graph.kg_base.save(image_rag_out_path)
    return None


async def search_context(rag_graph, text, *, top_k=3):
    """
    top_k: which result we are showing (TODO: when the image viewer is
    updated with a pager, change back to the top k results)
    """
    message = text["text"]
    rag_graph = rag_graph[0]["rag_graph"]

    # get all similarities
    emb_similarities = await rag_graph.search_context(
        message, max_results=top_k, unique_metadata_key="image_url"
    )

    # get the image urls, scores and descriptions
    result_list = []
    for emb_sim in emb_similarities:
        image_url = emb_sim.embedding.metadata["image_url"]
        score = emb_sim.score
        description = emb_sim.embedding.document
        result_list.append({"image_url": image_url, "score": score, "description": description})

    real_k = min(top_k, len(result_list) - 1)
    return {"embedding_similarities": [result_list[real_k]]}
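

# Illustrative output shape (values are made up); note that, per the TODO above, only a
# single result is returned for now:
#
# {
#     "embedding_similarities": [
#         {"image_url": "https://.../image_001.jpg", "score": 0.83, "description": "..."}
#     ]
# }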


def view_image(embedding_similarities):
    """
    Plotting the top image (from the embedding similarities).
    TODO: later on, the user can scroll the images and send feedback
    """
    embedding_similarities = embedding_similarities["embedding_similarities"]
    return embedding_similarities[0]["image_url"]


def ls_text_rag_loader(
    file_urls,
    *,
    input_type: RAGVersion = RAGVersion.V1,
    vdb_provider_name: str = "faiss",
    vdb_num_dimensions: int = 3072,
    vdb_collection_name: str = "lynx",
    text_embedder_interface: str = "openai",
    text_embedder_model_name_or_path: str = "text-embedding-3-large",
    # api_key_name: str = "OPENAI_API_KEY",
):
    """
    Loads a text-based RAG graph from saved files (pandas-readable links).
    """
    # handling inputs
    file_urls = file_urls["file_urls"]

    # getting the text embedder instance
    llm_params = {"name": text_embedder_interface}
    # if api_key_name:
    #     llm_params["api_key"] = os.getenv(api_key_name)
    llm = get_llm_engine(**llm_params)
    text_embedder = TextEmbedder(llm=llm, model=text_embedder_model_name_or_path)

    # getting the vector store
    if vdb_provider_name == "chromadb":
        vector_store = get_vector_store(name=vdb_provider_name, collection_name=vdb_collection_name)
    elif vdb_provider_name == "faiss":
        vector_store = get_vector_store(name=vdb_provider_name, num_dimensions=vdb_num_dimensions)
    else:
        raise ValueError(f"Vector store name '{vdb_provider_name}' is not supported.")

    # building up the RAG graph
    rag_graph = RAGGraph(
        PandasKnowledgeBaseGraph(vector_store=vector_store, text_embedder=text_embedder)
    )

    # loading the knowledge base (temporary + TODO: adding v2)
    if input_type == RAGVersion.V1:
        node_file = [f for f in file_urls if "nodes.p" in f][0]
        edge_file = [f for f in file_urls if "edges.p" in f][0]
        tempcluster_file = [f for f in file_urls if "clusters.p" in f][0]
        rag_graph.kg_base.load_v1_knowledge_base(
            nodes_path=node_file,
            edges_path=edge_file,
            template_cluster_path=tempcluster_file,
        )
    elif input_type == RAGVersion.V2:
        raise ValueError("Currently only the v1 input type is supported.")
    else:
        raise ValueError(f"Input type '{input_type}' is not supported.")

    return {"rag_graph": rag_graph}


async def ls_faq_to_rag(
    *,
    faq_excel_path: str = "",
    vdb_provider_name: str = "faiss",
    vdb_num_dimensions: int = 3072,
    vdb_collection_name: str = "lynx",
    text_embedder_interface: str = "openai",
    text_embedder_model_name_or_path: str = "text-embedding-3-large",
    scenario_cluster_distance_pct: int = 30,
):
    """
    Builds a text-based RAG graph from an FAQ Excel template: loads the FAQ nodes and
    edges into the knowledge base and adds the scenario (intent) clusters.
    """
    # getting the text embedder instance
    llm_params = {"name": text_embedder_interface}
    llm = get_llm_engine(**llm_params)
    text_embedder = TextEmbedder(llm=llm, model=text_embedder_model_name_or_path)

    # getting the vector store
    if vdb_provider_name == "chromadb":
        vector_store = get_vector_store(name=vdb_provider_name, collection_name=vdb_collection_name)
    elif vdb_provider_name == "faiss":
        vector_store = get_vector_store(name=vdb_provider_name, num_dimensions=vdb_num_dimensions)
    else:
        raise ValueError(f"Vector store name '{vdb_provider_name}' is not supported.")

    # building up the RAG graph
    rag_graph = RAGGraph(
        PandasKnowledgeBaseGraph(vector_store=vector_store, text_embedder=text_embedder)
    )

    # loading the knowledge base from the FAQ file
    rag_template = RAGTemplate.from_excel_path(
        path=faq_excel_path,
        faq_data_sheet_name="scenario_examples",
        scenario_data_sheet_name="scenario_scripts",
        prompt_codes_sheet_name="prompt_dictionary",
    )
    faq_loader_params = {
        "id_column": "scenario_example_ID",
        "timestamp_column": "last_modified_timestamp",
        "validity_column": "valid_flg",
        "question_type_contents_id": ["faq_question", "faq_question", "q_{id}"],
        "answer_type_contents_id": ["faq_answer", "{faq_question}\n\n{faq_answer}", "a_{id}"],
        "question_to_answer_edge_type_weight": ["qna", 1.0],
    }
    nodes, edges = FAQTemplateLoader(**faq_loader_params).load_nodes_and_edges(
        rag_template.faq_data
    )
    await rag_graph.kg_base.upsert_nodes(*nodes)
    rag_graph.kg_base.upsert_edges(edges)

    # Generating scenario clusters
    question_ids = [_id for _id in nodes[0] if _id.startswith("q_")]
    stored_embeddings = rag_graph.kg_base.vector_store.get(
        question_ids, include=["embeddings", "metadatas"]
    )
    embedding_vals = pd.Series([_emb.value for _emb in stored_embeddings], index=question_ids)
    labels = pd.Series(
        [_emb.metadata["scenario_name"] for _emb in stored_embeddings], index=question_ids
    )
    temp_cls = FclusterBasedClustering(distance_percentile=scenario_cluster_distance_pct)
    temp_cls.fit(embedding_vals, labels)
    df_tempclusters = temp_cls.get_cluster_centers()

    # Adding the scenario clusters to the RAG Graph
    df_tempclusters["template_id"] = "t_" + df_tempclusters.index.astype(str)
    df_tempclusters["embedding"] = df_tempclusters.apply(
        lambda row: Embedding(
            id=row["template_id"],
            value=row["cluster_center"],
            metadata={"scenario_name": row["control_label"], "type": "intent_cluster"},
        ),
        axis=1,
    )
    embedding_list = df_tempclusters["embedding"].tolist()
    rag_graph.kg_base.vector_store.upsert(embedding_list)

    return {"rag_graph": rag_graph}


def ls_rag_chatbot_builder(
    rag_graph,
    *,
    scenario_file: str = "uploads/lynx_chatbot_scenario_selector.yaml",
    node_types: str = "intent_cluster",
    scenario_meta_name: str = "",
):
    """
    Builds up a RAG Graph-based chatbot (basically the loaded RAG graph +
    a scenario selector).

    TODO: Later, the scenario selector can be built up synthetically from
    the input documents - or semi-automated, not just from the scenario
    yaml.
    """
    scenarios = load_config(scenario_file)
    node_types = [t.strip() for t in node_types.split(",")]

    # handling inputs
    rag_graph = rag_graph["rag_graph"]
    parameters = {
        "scenarios": [Scenario(**scenario) for scenario in scenarios],
        "node_types": node_types,
    }
    if len(scenario_meta_name) > 0:
        parameters["get_scenario_name"] = lambda node: node.metadata[scenario_meta_name]

    # loading the scenarios
    scenario_selector = ScenarioSelector(**parameters)

    # TODO: later we should unify this "knowledge base" object across the functions;
    # it could always be an input of a RAG Chatbot, but also of other apps.
    return {
        "knowledge_base": {
            "rag_graph": rag_graph,
            "scenario_selector": scenario_selector,
        }
    }


def ls_rag_chatbot_backend(
    knowledge_base,
    chat_processor,
    *,
    negative_answer=DEFAULT_NEGATIVE_ANSWER,
    retriever_limits_by_type="{}",
    retriever_strict_limits=True,
    retriever_overall_chunk_limit=20,
    retriever_overall_token_limit=3000,
    retriever_max_iterations=3,
    llm_interface: str = "openai",
    llm_model_name: str = "gpt-4o",
    # api_key_name: str = "OPENAI_API_KEY",
):
    """
    Returns a chatbot back-end (ChatAPI) instance.
    """
    # handling inputs
    rag_graph = knowledge_base[0]["knowledge_base"]["rag_graph"]
    scenario_selector = knowledge_base[0]["knowledge_base"]["scenario_selector"]
    chat_processor = chat_processor[0]["chat_processor"]
    limits_by_type = json.loads(retriever_limits_by_type)

    # connecting to the LLM
    llm_params = {"name": llm_interface}
    # if api_key_name:
    #     llm_params["api_key"] = os.getenv(api_key_name)
    llm = get_llm_engine(**llm_params)

    # setting the parameters
    params = {
        "limits_by_type": limits_by_type,
        "strict_limits": retriever_strict_limits,
        "max_results": retriever_overall_chunk_limit,
        "token_limit": retriever_overall_token_limit,
        "max_iterations": retriever_max_iterations,
    }

    # generating the RAG Chatbot
    rag_chatbot = RAGChatbot(
        rag_graph=rag_graph,
        scenario_selector=scenario_selector,
        llm=llm,
        negative_answer=negative_answer,
        **params,
    )

    # generating the chatbot back-end
    c = ChatAPI(
        chatbot=rag_chatbot,
        chat_processor=chat_processor,
        model=llm_model_name,
    )
    return {"chat_api": c}
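

# Illustrative value for `retriever_limits_by_type` (the node-type names are hypothetical):
# a JSON object mapping node types to per-type chunk limits, e.g.
# '{"intent_cluster": 2, "image_description": 5}'. An empty object ("{}") presumably means
# no per-type limits, leaving only the overall chunk/token limits in effect.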


def chat_processor(processor, *, _ctx: one_by_one.Context):
    cfg = _ctx.last_result or {
        "question_processors": [],
        "answer_processors": [],
        "masks": [],
    }
    for f in ["question_processor", "answer_processor", "mask"]:
        if f in processor:
            cfg[f + "s"].append(processor[f])
    question_processors = cfg["question_processors"][:]
    answer_processors = cfg["answer_processors"][:]
    masking_templates = {}
    for mask in cfg["masks"]:
        masking_templates[mask["name"]] = mask
    if masking_templates:
        question_processors.append(MaskTemplate(masking_templates=masking_templates))
        answer_processors.append(MaskTemplate(masking_templates=masking_templates))
    chat_processor = ChatProcessor(
        question_processors=question_processors, answer_processors=answer_processors
    )
    return {"chat_processor": chat_processor, **cfg}
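

# How the accumulation above plays out (illustrative): when several processor nodes feed
# into this op, the one_by_one executor calls it once per input and `_ctx.last_result`
# carries the configuration collected so far. So chaining the outputs of
# `truncate_history()` and `mask(name="email", ...)` (defined below) yields a ChatProcessor
# whose question processors contain the TruncateHistory instance plus a MaskTemplate built
# from the collected masks.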


def lynxscribe_message(
    *, prompt_role: MessageRole = MessageRole.SYSTEM, prompt_content: ops.LongStr
):
    return_message = Message(role=prompt_role.value, content=prompt_content.strip())
    return {"prompt_message": return_message}


def read_excel(*, file_path: str, sheet_name: str = "Sheet1", columns: str = ""):
    """
    Reads an Excel file and returns the content of the specified sheet.
    The columns parameter can be used to specify which columns to include in the output
    (separate the values with commas). If not specified, all columns are included.

    TODO: more general: several input/output versions.
    """
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    if columns:
        columns = [c.strip() for c in columns.split(",") if c.strip()]
        columns = [c for c in columns if c in df.columns]
        if len(columns) == 0:
            raise ValueError("No valid columns specified.")
        df = df[columns].copy()
    return {"dataframe": df}


async def ls_task_solver(
    system_prompt,
    instruction_prompt,
    dataframe,
    *,
    llm_interface: str = "openai",
    llm_model_name: str = "gpt-4o",
    new_column_names: str = "processed_field",
    # api_key_name: str = "OPENAI_API_KEY",
):
    """
    Solves the described task on a data frame and puts the results into a new column.
    If multiple new_column_names are provided, the structured dictionary output
    is split into multiple columns.
    """
    # handling inputs
    system_message = system_prompt[0]["prompt_message"]
    instruction_message = instruction_prompt[0]["prompt_message"]
    df = dataframe["dataframe"]

    # preparing output
    out_df = df.copy()

    # connecting to the LLM
    llm_params = {"name": llm_interface}
    # if api_key_name:
    #     llm_params["api_key"] = os.getenv(api_key_name)
    llm = get_llm_engine(**llm_params)

    # getting the list of fieldnames used in the instruction message
    fieldnames = []
    for pot_fieldname in df.columns:
        if "{" + pot_fieldname + "}" in instruction_message.content:
            fieldnames.append(pot_fieldname)

    # generate a list of instruction messages (from fieldnames)
    # each row of the df is a separate instruction message
    # TODO: make it fast for large dataframes
    instruction_messages = []
    for i in range(len(df)):
        instruction_message_i = deepcopy(instruction_message)
        for fieldname in fieldnames:
            instruction_message_i.content = instruction_message_i.content.replace(
                "{" + fieldname + "}", str(df.iloc[i][fieldname])
            )
        instruction_messages.append(instruction_message_i)

    # generate the completion prompts
    completion_prompts = [
        ChatCompletionPrompt(
            model=llm_model_name,
            messages=[system_message, instruction_message_j],
        )
        for instruction_message_j in instruction_messages
    ]

    # get the answers
    tasks = [llm.acreate_completion(completion_prompt=_prompt) for _prompt in completion_prompts]
    out_completions = await asyncio.gather(*tasks)

    # answer post-processing: 1 vs more columns
    col_list = [_c.strip() for _c in new_column_names.split(",") if _c.strip()]
    if len(col_list) == 0:
        raise ValueError("No valid column names specified.")
    elif len(col_list) == 1:
        out_df[col_list[0]] = [result.choices[0].message.content for result in out_completions]
    else:
        answers = [
            dictionary_corrector(result.choices[0].message.content, expected_keys=col_list)
            for result in out_completions
        ]
        for col in col_list:
            out_df[col] = [answer[col] for answer in answers]
    return {"dataframe": out_df}
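

# Illustrative use of the {fieldname} templating (column names and prompt text are made up):
# with a dataframe column named "complaint_text" and an instruction prompt of
# "Classify the complaint: {complaint_text}", the placeholder is substituted row by row.
# With new_column_names="category, severity" the LLM is expected to answer with a JSON
# object containing those two keys, which then become two new columns in the output frame.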


def truncate_history(*, max_tokens=10000):
    return {"question_processor": TruncateHistory(max_tokens=max_tokens)}


def mask(*, name="", regex="", exceptions="", mask_pattern=""):
    exceptions = [e.strip() for e in exceptions.split(",") if e.strip()]
    return {
        "mask": {
            "name": name,
            "regex": regex,
            "exceptions": exceptions,
            "mask_pattern": mask_pattern,
        }
    }


async def test_chat_api(message, chat_api, *, show_details=False):
    chat_api = chat_api[0]["chat_api"]
    request = ChatCompletionPrompt(
        model="",
        messages=[{"role": "user", "content": message["text"]}],
    )
    response = await chat_api.answer(request, stream=False)
    if len(response.choices) == 0:
        answer = "The following FAQ items are similar to the question:\n"
        for item in response.sources:
            answer += f"------------------------------------------------------ \n{item.body}\n\n"
    else:
        answer = response.choices[0].message.content
    if show_details:
        return {"answer": answer, **response.__dict__}
    else:
        return {"answer": answer}


def input_chat(*, chat: str):
    return {"text": chat}


def view_df(input):
    df = input[0]["dataframe"]
    v = {
        "dataframes": {
            "df": {
                "columns": [str(c) for c in df.columns],
                "data": df.values.tolist(),
            }
        }
    }
    return v


def view(input):
    columns = [str(c) for c in input.keys() if not str(c).startswith("_")]
    v = {
        "dataframes": {
            "df": {
                "columns": columns,
                "data": [[input[c] for c in columns]],
            }
        }
    }
    return v


async def get_chat_api(ws: str):
    from lynxkite.core import workspace

    cwd = pathlib.Path()
    path = cwd / (ws + ".lynxkite.json")
    assert path.is_relative_to(cwd), f"Path '{path}' is invalid"
    assert path.exists(), f"Workspace {path} does not exist"
    ws = workspace.Workspace.load(path)
    # Remove any test nodes.
    ws.nodes = [op for op in ws.nodes if op.data.title != "Test Chat API"]
    ws.normalize()
    executor = ops.EXECUTORS[ENV]
    contexts = await executor(ws)
    nodes = [op for op in ws.nodes if op.data.title == "LynxScribe RAG Graph Chatbot Backend"]
    [node] = nodes
    context = contexts[node.id]
    return context.last_result["chat_api"]


async def stream_chat_api_response(request):
    chat_api = await get_chat_api(request["model"])
    request = ChatCompletionPrompt(**request)
    async for chunk in await chat_api.answer(request, stream=True):
        chunk.sources = []
        yield chunk.model_dump_json()


async def get_chat_api_response(request):
    chat_api = await get_chat_api(request["model"])
    request = ChatCompletionPrompt(**request)
    response = await chat_api.answer(request, stream=False)
    response.sources = []
    return response.model_dump_json()


async def api_service_post(request):
    """
    Serves a chat endpoint that matches LynxScribe's interface.

    To access it you need to add the "module" and "workspace" parameters.
    The workspace must contain exactly one "Chat API" node.

    curl -X POST ${LYNXKITE_URL}/api/service/server.lynxkite_ops \
      -H "Content-Type: application/json" \
      -d '{
        "model": "LynxScribe demo",
        "messages": [{"role": "user", "content": "what does the fox say"}]
      }'
    """
    path = "/".join(request.url.path.split("/")[4:])
    request = await request.json()
    if path == "chat/completions":
        if request["stream"]:
            from sse_starlette.sse import EventSourceResponse

            return EventSourceResponse(stream_chat_api_response(request))
        else:
            return await get_chat_api_response(request)
    return {"error": "Not found"}


async def api_service_get(request):
    path = "/".join(request.url.path.split("/")[4:])
    if path == "models":
        return {
            "object": "list",
            "data": [
                {
                    "id": ws.removesuffix(".lynxkite.json"),
                    "object": "model",
                    "created": 0,
                    "owned_by": "lynxkite",
                    "meta": {"profile_image_url": "https://lynxkite.com/favicon.png"},
                }
                for ws in get_lynxscribe_workspaces()
            ],
        }
    return {"error": "Not found"}


def get_lynxscribe_workspaces() -> list[str]:
    from lynxkite.core import workspace

    workspaces = []
    for p in pathlib.Path().glob("**/*"):
        if p.is_file():
            try:
                ws = workspace.Workspace.load(p)
                if ws.env == ENV:
                    workspaces.append(str(p))
            except Exception:
                pass  # Ignore files that are not valid workspaces.
    workspaces.sort()
    return workspaces


def dictionary_corrector(dict_string: str, expected_keys: list | None = None) -> dict:
    """
    Processes LLM outputs where the LLM returns a dictionary in string format. It optionally
    cross-checks the input against the expected keys and returns a dictionary with the expected
    keys and their values ('unknown' if a key is not present). If there is an error during
    processing, it returns a dictionary of the expected keys, all with 'error' as the value
    (or an empty dictionary if no expected keys were given).
    Currently the function does not delete the extra key-value pairs.
    """
    out_dict = {}
    if len(dict_string) == 0:
        return out_dict

    # deleting the optional text before the first and after the last curly brackets
    dstring_prc = dict_string
    if dstring_prc[0] != "{":
        dstring_prc = "{" + "{".join(dstring_prc.split("{")[1:])
    if dstring_prc[-1] != "}":
        dstring_prc = "}".join(dstring_prc.split("}")[:-1]) + "}"

    try:
        trf_dict = json.loads(dstring_prc)
        if expected_keys:
            for _key in expected_keys:
                if _key in trf_dict:
                    out_dict[_key] = trf_dict[_key]
                else:
                    out_dict[_key] = "unknown"
        else:
            out_dict = trf_dict
    except Exception:
        if expected_keys:
            for _key in expected_keys:
                out_dict[_key] = "error"
    return out_dict
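

# Illustrative examples of the behaviour above (inputs are made up):
#
# dictionary_corrector('Sure! {"colour": "red"}', expected_keys=["colour", "shape"])
# -> {"colour": "red", "shape": "unknown"}
#
# dictionary_corrector("not a dict at all", expected_keys=["colour"])
# -> {"colour": "error"}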