Spaces:

hyperdemocracy
/

legisqa-local

Sleeping

+"""Display components for LegisQA"""
+import streamlit as st
+from legisqa_local.utils.text import escape_markdown, replace_legis_ids_with_urls
+from legisqa_local.utils.usage import display_api_usage
+from legisqa_local.utils.formatting import render_retrieved_chunks
+from legisqa_local.config.models import PROVIDER_MODELS
def render_example_queries():
    """Show a collapsible list of sample prompts."""
    # Every sample lives in its own fenced block so it renders as plain,
    # copyable text inside the expander.
    examples_md = """
```
What are the themes around artificial intelligence?
```
```
Write a well cited 3 paragraph essay on food insecurity.
```
```
Create a table summarizing major climate change ideas with columns legis_id, title, idea.
```
```
Write an action plan to keep social security solvent.
```
```
Suggest reforms that would benefit the Medicaid program.
```
        """
    with st.expander("Example Queries"):
        st.write(examples_md)
def render_response(
    response: dict,
    model_info: dict,
    provider: str,
    should_escape_markdown: bool,
    should_add_legis_urls: bool,
    tag: str | None = None,
):
    """Draw one RAG answer followed by its usage panel and source chunks.

    Args:
        response: Chain output; read for its "aimessage" and "docs" entries.
        model_info: Pricing entry for the model, forwarded to the usage panel.
        provider: Provider name, forwarded to the usage panel.
        should_escape_markdown: Escape markdown characters in the answer text.
        should_add_legis_urls: Turn legislation IDs in the answer into links.
        tag: Optional label appended to each panel heading.
    """
    ai_message = response["aimessage"]

    # Escaping runs before link insertion, same order as the UI toggles imply.
    answer = ai_message.content
    if should_escape_markdown:
        answer = escape_markdown(answer)
    if should_add_legis_urls:
        answer = replace_legis_ids_with_urls(answer)

    heading = "Response" if tag is None else f"Response ({tag})"
    with st.container(border=True):
        st.write(heading)
        st.info(answer)

    display_api_usage(ai_message, model_info, provider, tag=tag)
    render_retrieved_chunks(response["docs"], tag=tag)

src/legisqa_local/components/forms.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""Form components for configuration in LegisQA"""
+import streamlit as st
+from legisqa_local.config.models import PROVIDER_MODELS, CONGRESS_NUMBERS, SPONSOR_PARTIES
def get_generative_config(key_prefix: str) -> dict:
    """Draw the generation settings widgets and return their current values.

    Args:
        key_prefix: Prepended to every widget key so that several copies of
            this form can coexist on one page.

    Returns:
        Dict with "provider", "model_name", "temperature",
        "max_output_tokens", "should_escape_markdown" and
        "should_add_legis_urls".
    """

    def widget_key(name: str) -> str:
        return f"{key_prefix}|{name}"

    provider = st.selectbox(
        label="provider",
        options=PROVIDER_MODELS.keys(),
        key=widget_key("provider"),
    )
    # The model list depends on the provider picked just above.
    model_name = st.selectbox(
        label="model_name",
        options=PROVIDER_MODELS[provider],
        key=widget_key("model_name"),
    )
    temperature = st.slider(
        "temperature",
        min_value=0.0,
        max_value=2.0,
        value=0.0,
        key=widget_key("temperature"),
    )
    max_output_tokens = st.slider(
        "max_output_tokens",
        min_value=8192,
        max_value=16_384,
        key=widget_key("max_output_tokens"),
    )
    escape_flag = st.checkbox(
        "should_escape_markdown",
        value=False,
        key=widget_key("should_escape_markdown"),
    )
    urls_flag = st.checkbox(
        "should_add_legis_urls",
        value=True,
        key=widget_key("should_add_legis_urls"),
    )

    return {
        "provider": provider,
        "model_name": model_name,
        "temperature": temperature,
        "max_output_tokens": max_output_tokens,
        "should_escape_markdown": escape_flag,
        "should_add_legis_urls": urls_flag,
    }
def get_retrieval_config(key_prefix: str) -> dict:
    """Draw the retrieval settings widgets and return their current values.

    Args:
        key_prefix: Prepended to every widget key so that several copies of
            this form can coexist on one page.

    Returns:
        Dict with "n_ret_docs", "filter_legis_id", "filter_bioguide_id",
        "filter_congress_nums" and "filter_sponsor_parties".
    """

    def widget_key(name: str) -> str:
        return f"{key_prefix}|{name}"

    n_ret_docs = st.slider(
        "Number of chunks to retrieve",
        min_value=1,
        max_value=32,
        value=8,
        key=widget_key("n_ret_docs"),
    )
    legis_id = st.text_input(
        "Bill ID (e.g. 118-s-2293)", key=widget_key("filter_legis_id")
    )
    bioguide_id = st.text_input(
        "Bioguide ID (e.g. R000595)", key=widget_key("filter_bioguide_id")
    )
    # Preselect the two most recent congresses.
    congress_nums = st.multiselect(
        "Congress Numbers",
        CONGRESS_NUMBERS,
        default=CONGRESS_NUMBERS[-2:],
        key=widget_key("filter_congress_nums"),
    )
    sponsor_parties = st.multiselect(
        "Sponsor Party",
        SPONSOR_PARTIES,
        default=SPONSOR_PARTIES,
        key=widget_key("filter_sponsor_parties"),
    )

    return {
        "n_ret_docs": n_ret_docs,
        "filter_legis_id": legis_id,
        "filter_bioguide_id": bioguide_id,
        "filter_congress_nums": congress_nums,
        "filter_sponsor_parties": sponsor_parties,
    }

src/legisqa_local/components/sidebar.py ADDED Viewed

	@@ -0,0 +1,59 @@

+"""Sidebar components for LegisQA"""
+import streamlit as st
+import os
+from legisqa_local.config.settings import get_chroma_config
def render_chromadb_status():
    """Report in the sidebar whether the ChromaDB directory is present."""
    st.subheader("🗄️ Vector Database")
    try:
        settings = get_chroma_config()
        db_dir = settings["persist_directory"]
        if not os.path.exists(db_dir):
            st.error("❌ ChromaDB Not Found")
            st.caption(f"Expected path: {db_dir}")
            st.caption("Please check the database path")
        else:
            st.success("✅ ChromaDB Ready")
            st.caption("📊 Using pre-existing database")
            st.caption(f"📁 Collection: {settings['collection_name']}")
            # Only the name of the containing directory is shown, not the
            # full path.
            parent_name = os.path.basename(os.path.dirname(db_dir))
            st.caption(f"📁 Path: .../{parent_name}")
    except Exception as err:
        # UI boundary: show a short message instead of breaking the sidebar.
        st.error("❌ ChromaDB Configuration Error")
        st.caption(f"Error: {str(err)[:50]}...")
def render_outreach_links():
    """List the external tools and services behind the app as subheaders."""
    nomic_base_url = "https://atlas.nomic.ai/data/gabrielhyperdemocracy"
    nomic_map_name = "us-congressional-legislation-s1024o256nomic-1"

    # (emoji shortcode + caption + link text, target URL), in display order.
    links = [
        (
            ":world_map: Visualize [nomic atlas]",
            f"{nomic_base_url}/{nomic_map_name}/map",
        ),
        (
            ":hugging_face: Raw [huggingface datasets]",
            "https://huggingface.co/hyperdemocracy",
        ),
        (":card_file_box: Vector DB [chromadb]", "https://www.trychroma.com/"),
        (":pancakes: Inference [together.ai]", "https://www.together.ai/"),
        (
            ":eyeglasses: Inference [google-gemini]",
            "https://ai.google.dev/gemini-api",
        ),
        (":hut: Inference [anthropic]", "https://www.anthropic.com/api"),
        (
            ":sparkles: Inference [openai]",
            "https://platform.openai.com/docs/overview",
        ),
        (":parrot: Orchestration [langchain]", "https://www.langchain.com/"),
    ]
    for caption, url in links:
        st.subheader(f"{caption}({url})")
def render_sidebar():
    """Draw every sidebar section, each inside its own bordered container."""
    sections = (render_chromadb_status, render_outreach_links)
    for render_section in sections:
        with st.container(border=True):
            render_section()

src/legisqa_local/config/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Configuration module for LegisQA"""

src/legisqa_local/config/models.py ADDED Viewed

	@@ -0,0 +1,41 @@

+"""Model configurations for different LLM providers"""
# Congress sessions and sponsor party codes offered by the retrieval filters.
CONGRESS_NUMBERS = list(range(113, 120))
SPONSOR_PARTIES = ["D", "R", "L", "I"]

# Per-model pricing. "pmi" / "pmo" are the prices per million input / output
# tokens; the usage helpers multiply token counts by 1e-6 times these values.

OPENAI_CHAT_MODELS = {
    "gpt-5-nano": {"cost": dict(pmi=0.05, pmo=0.40)},
    "gpt-5-mini": {"cost": dict(pmi=0.25, pmo=2.00)},
    "gpt-5": {"cost": dict(pmi=1.25, pmo=10.0)},
    "gpt-4o-mini": {"cost": dict(pmi=0.15, pmo=0.60)},
    "gpt-4o": {"cost": dict(pmi=2.50, pmo=10.0)},
}

ANTHROPIC_CHAT_MODELS = {
    "claude-3-5-haiku-20241022": {"cost": dict(pmi=0.80, pmo=4.00)},
    "claude-sonnet-4-20250514": {"cost": dict(pmi=3.0, pmo=15.0)},
    "claude-opus-4-1-20250805": {"cost": dict(pmi=15.0, pmo=75.0)},
}

TOGETHER_CHAT_MODELS = {
    "openai/gpt-oss-20b": {"cost": dict(pmi=0.05, pmo=0.20)},
    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free": {
        "cost": dict(pmi=0.00, pmo=0.00)
    },
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": {
        "cost": dict(pmi=0.18, pmo=0.18)
    },
    "meta-llama/Llama-3.3-70B-Instruct-Turbo": {
        "cost": dict(pmi=0.88, pmo=0.88)
    },
    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo": {
        "cost": dict(pmi=3.50, pmo=3.50)
    },
    "Qwen/Qwen3-235B-A22B-Thinking-2507": {"cost": dict(pmi=0.65, pmo=3.00)},
    "moonshotai/Kimi-K2-Instruct": {"cost": dict(pmi=1.00, pmo=3.00)},
}

GOOGLE_CHAT_MODELS = {
    "gemini-2.5-flash-lite": {"cost": dict(pmi=0.10, pmo=0.40)},
    "gemini-2.5-flash": {"cost": dict(pmi=0.30, pmo=2.50)},
    "gemini-2.5-pro": {"cost": dict(pmi=1.25, pmo=10.0)},
}

# Provider name (as shown in the UI) -> that provider's model table.
PROVIDER_MODELS = {
    "OpenAI": OPENAI_CHAT_MODELS,
    "Anthropic": ANTHROPIC_CHAT_MODELS,
    "Together": TOGETHER_CHAT_MODELS,
    "Google": GOOGLE_CHAT_MODELS,
}

src/legisqa_local/config/settings.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""Application settings and configuration"""
+import os
+import streamlit as st
# Page-level Streamlit options (layout and browser tab title).
STREAMLIT_CONFIG = dict(
    layout="wide",
    page_title="LegisQA",
)
def get_secret(key: str, default=None):
    """Look up a secret in Streamlit secrets, then in environment variables.

    Args:
        key: Secret name as used in the Streamlit secrets file.
        default: Value returned when the secret is found nowhere.

    Returns:
        The Streamlit secret if present; otherwise the environment variable
        named exactly ``key``; otherwise the one named ``key.upper()``;
        otherwise ``default``.
    """
    try:
        # Try Streamlit secrets first (for local development)
        return st.secrets[key]
    except (KeyError, FileNotFoundError):
        # Fall back to environment variables (for Docker/HF Spaces)
        pass

    # Environment variables are conventionally upper-case while callers pass
    # lower-case keys, so try the exact name first and then its upper-case
    # form.
    for env_name in (key, key.upper()):
        value = os.getenv(env_name)
        if value is not None:
            return value
    return default
# Environment variables setup
def setup_environment():
    """Export the environment variables LangChain and the tokenizers read.

    An already-exported LANGCHAIN_API_KEY or LANGCHAIN_PROJECT is kept when
    no explicit secret is found, rather than being overwritten with the
    fallback value. Tracing is switched on only when an API key is available.
    """
    api_key = get_secret("langchain_api_key", "") or os.environ.get(
        "LANGCHAIN_API_KEY", ""
    )
    os.environ["LANGCHAIN_API_KEY"] = str(api_key)
    # Without a key, tracing cannot authenticate, so leave it off.
    os.environ["LANGCHAIN_TRACING_V2"] = "true" if api_key else "false"

    project = (
        get_secret("langchain_project")
        or os.environ.get("LANGCHAIN_PROJECT")
        or "legisqa-local"
    )
    os.environ["LANGCHAIN_PROJECT"] = str(project)

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
# ChromaDB configuration
def get_chroma_config():
    """Read the ChromaDB location and collection name from the environment.

    Returns:
        Dict with "persist_directory" (default "./chromadb") and
        "collection_name" (default "usc").
    """
    persist_directory = os.getenv("CHROMA_PERSIST_DIRECTORY", "./chromadb")
    collection_name = os.getenv("CHROMA_COLLECTION_NAME", "usc")
    return dict(
        persist_directory=persist_directory,
        collection_name=collection_name,
    )
# Embedding settings: the device the model runs on and the model identifier.
EMBEDDING_DEVICE = "cpu"
EMBEDDING_MODEL = "sentence-transformers/static-retrieval-mrl-en-v1"

src/legisqa_local/core/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Core business logic for LegisQA"""

src/legisqa_local/core/embeddings.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""Embedding functionality for LegisQA"""
+from langchain_huggingface import HuggingFaceEmbeddings
+from legisqa_local.config.settings import EMBEDDING_MODEL, EMBEDDING_DEVICE
def load_embeddings():
    """Build the HuggingFace embedding function from the configured settings."""
    return HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={"device": EMBEDDING_DEVICE},
    )

src/legisqa_local/core/llm.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""LLM provider implementations for LegisQA"""
+import streamlit as st
+from langchain_openai import ChatOpenAI
+from langchain_anthropic import ChatAnthropic
+from langchain_together import ChatTogether
+from langchain_google_genai import ChatGoogleGenerativeAI
+from legisqa_local.config.settings import get_secret
def get_llm(gen_config: dict):
    """Instantiate the chat model selected in the generation settings.

    Args:
        gen_config: Reads "provider", "model_name", "temperature" and
            "max_output_tokens".

    Returns:
        A chat model client for the chosen provider.

    Raises:
        ValueError: If the provider is not one of the supported names.
    """
    provider = gen_config["provider"]

    if provider == "OpenAI":
        return ChatOpenAI(
            model=gen_config["model_name"],
            temperature=gen_config["temperature"],
            api_key=get_secret("openai_api_key"),
            max_tokens=gen_config["max_output_tokens"],
        )

    if provider == "Anthropic":
        return ChatAnthropic(
            model_name=gen_config["model_name"],
            temperature=gen_config["temperature"],
            api_key=get_secret("anthropic_api_key"),
            max_tokens_to_sample=gen_config["max_output_tokens"],
        )

    if provider == "Together":
        return ChatTogether(
            model=gen_config["model_name"],
            temperature=gen_config["temperature"],
            max_tokens=gen_config["max_output_tokens"],
            api_key=get_secret("together_api_key"),
        )

    if provider == "Google":
        return ChatGoogleGenerativeAI(
            model=gen_config["model_name"],
            temperature=gen_config["temperature"],
            api_key=get_secret("google_api_key"),
            max_output_tokens=gen_config["max_output_tokens"],
        )

    raise ValueError(f"Unknown provider: {gen_config['provider']}")

src/legisqa_local/core/rag.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""RAG (Retrieval-Augmented Generation) chain implementation"""
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+from legisqa_local.core.llm import get_llm
+from legisqa_local.core.vectorstore import load_vectorstore, get_vectorstore_filter
+from legisqa_local.utils.formatting import format_docs
def create_rag_chain(llm, retriever):
    """Assemble the retrieve -> format -> prompt -> generate pipeline.

    Args:
        llm: Chat model that receives the filled-in prompt.
        retriever: Runnable that maps the query to retrieved documents.

    Returns:
        A runnable whose output carries "docs", "query", "context" and
        "aimessage".
    """
    template = """You are an expert legislative analyst. Use the following excerpts from US congressional legislation to respond to the user's query. The excerpts are formatted as a JSON list. Each JSON object has "legis_id", "title", "introduced_date", "sponsor", and "snippets" keys. If a snippet is useful in writing part of your response, then cite the "legis_id", "title", "introduced_date", and "sponsor" in the response. When citing legis_id, use the same format as the excerpts (e.g. "116-hr-125"). If you don't know how to respond, just tell the user.
---
Congressional Legislation Excerpts:
{context}
---
Query: {query}"""
    prompt = ChatPromptTemplate.from_messages([("human", template)])

    # Step 1: fetch documents while passing the raw query through unchanged.
    gather = RunnableParallel(
        {
            "docs": retriever,
            "query": RunnablePassthrough(),
        }
    )
    # Step 2: serialise the retrieved documents into the prompt context.
    with_context = gather.assign(
        context=lambda state: format_docs(state["docs"])
    )
    # Step 3: fill the prompt and call the model.
    return with_context.assign(aimessage=prompt | llm)
def process_query(gen_config: dict, ret_config: dict, query: str):
    """Answer one query with retrieval-augmented generation.

    Args:
        gen_config: Generation settings used to build the LLM.
        ret_config: Retrieval settings; "n_ret_docs" sets how many chunks to
            fetch and the filter fields narrow the search.
        query: The user's question.

    Returns:
        The output of the RAG chain for ``query``.
    """
    store = load_vectorstore()
    model = get_llm(gen_config)
    metadata_filter = get_vectorstore_filter(ret_config)

    # ChromaDB uses 'filter' parameter in search_kwargs; omit it when there
    # is nothing to filter on.
    search_kwargs = {
        "k": ret_config["n_ret_docs"],
        **({"filter": metadata_filter} if metadata_filter else {}),
    }
    chain = create_rag_chain(
        model, store.as_retriever(search_kwargs=search_kwargs)
    )
    return chain.invoke(query)

src/legisqa_local/core/vectorstore.py ADDED Viewed

	@@ -0,0 +1,31 @@

+"""Vector store operations for LegisQA"""
+from langchain_chroma import Chroma
+from legisqa_local.core.embeddings import load_embeddings
+from legisqa_local.config.settings import get_chroma_config
def load_vectorstore():
    """Open the persisted ChromaDB collection with the app's embeddings."""
    settings = get_chroma_config()
    embedder = load_embeddings()
    return Chroma(
        persist_directory=settings["persist_directory"],
        collection_name=settings["collection_name"],
        embedding_function=embedder,
    )
def get_vectorstore_filter(ret_config: dict) -> dict:
    """Build the ChromaDB metadata filter for the retrieval settings.

    Args:
        ret_config: Retrieval settings. Reads "filter_legis_id",
            "filter_bioguide_id" and "filter_congress_nums"; missing or empty
            entries add no condition.

    Returns:
        ``{}`` when nothing is filtered, the single condition when there is
        exactly one, or ``{"$and": [...]}`` when there are several. Chroma
        rejects a ``where`` dict with more than one top-level key, so
        multiple conditions must be combined explicitly.
    """
    conditions = []

    legis_id = (ret_config.get("filter_legis_id") or "").strip()
    if legis_id:
        conditions.append({"legis_id": legis_id})

    # The retrieved chunks carry the sponsor under "sponsor_bioguide_id".
    bioguide_id = (ret_config.get("filter_bioguide_id") or "").strip()
    if bioguide_id:
        conditions.append({"sponsor_bioguide_id": bioguide_id})

    congress_nums = ret_config.get("filter_congress_nums") or []
    if congress_nums:
        conditions.append({"congress_num": {"$in": list(congress_nums)}})

    # NOTE(review): "filter_sponsor_parties" is collected by the form but is
    # not applied here; the metadata field holding the party is not visible
    # in this code, so confirm its name before adding that condition.

    if not conditions:
        return {}
    if len(conditions) == 1:
        return conditions[0]
    return {"$and": conditions}

src/legisqa_local/tabs/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Tab implementations for the Streamlit interface"""

src/legisqa_local/tabs/base.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""Base tab interface for LegisQA"""
+from abc import ABC, abstractmethod
class BaseTab(ABC):
    """Common interface shared by every tab of the app."""

    def __init__(self, name: str, key_prefix: str):
        # name: label for the tab; key_prefix: namespace for its widget and
        # session-state keys.
        self.name, self.key_prefix = name, key_prefix

    @abstractmethod
    def render(self):
        """Draw the tab's content; subclasses must implement this."""

src/legisqa_local/tabs/guide_tab.py ADDED Viewed

	@@ -0,0 +1,52 @@

+"""Guide tab implementation"""
+import streamlit as st
+from legisqa_local.tabs.base import BaseTab
class GuideTab(BaseTab):
    """Static help page describing how to use the app."""

    def __init__(self):
        super().__init__(name="Guide", key_prefix="guide")

    def render(self):
        """Write the guide text to the page as markdown."""
        guide_md = """
# LegisQA Guide
Welcome to LegisQA! This tool allows you to query congressional legislation using natural language.
## How to Use
1. **Choose a Tab**: Select between single RAG queries or side-by-side comparisons
2. **Enter Your Query**: Ask questions about congressional legislation in natural language
3. **Configure Settings**: Adjust model and retrieval parameters as needed
4. **Submit**: Click submit and wait for the AI to generate a response
## Example Queries
- "What are the main themes around artificial intelligence legislation?"
- "Write a summary of recent climate change bills"
- "Create a table of healthcare reform proposals"
- "What bills address social security reform?"
## Features
- **Multiple LLM Providers**: OpenAI, Anthropic, Together.ai, Google
- **Flexible Retrieval**: Filter by congress number, bill ID, sponsor party
- **Citation Support**: Responses include links to original legislation
- **Cost Tracking**: Monitor API usage and costs
- **Side-by-Side**: Compare responses from different models
## Tips
- Be specific in your queries for better results
- Use the retrieval filters to narrow down the search space
- Try different models to compare response quality
- Check the retrieved chunks to understand the source material
        """
        st.write(guide_md)

src/legisqa_local/tabs/rag_sbs_tab.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""Side-by-side RAG tab implementation"""
+import streamlit as st
+from legisqa_local.tabs.base import BaseTab
+from legisqa_local.components.forms import get_generative_config, get_retrieval_config
+from legisqa_local.components.display import render_response
+from legisqa_local.core.rag import process_query
+from legisqa_local.config.models import PROVIDER_MODELS
class RAGSideBySideTab(BaseTab):
    """Side-by-side RAG comparison tab.

    One query is run against two independently configured groups and the
    two answers are shown in adjacent columns.
    """

    def __init__(self):
        super().__init__("RAG (side-by-side)", "query_rag_sbs")

    def render(self):
        """Render the side-by-side RAG tab."""
        SS = st.session_state

        with st.form(f"{self.key_prefix}|query_form"):
            query = st.text_area(
                "Enter a query that can be answered with congressional legislation:"
            )
            cols = st.columns(2)
            with cols[0]:
                query_submitted = st.form_submit_button("Submit")
            with cols[1]:
                status_placeholder = st.empty()

        grp_names = {"grp1": "Group 1", "grp2": "Group 2"}

        # Configuration row: one column of settings per group.
        gen_configs = {}
        ret_configs = {}
        for grp, column in zip(grp_names, st.columns(2)):
            with column:
                st.header(grp_names[grp])
                key_prefix = f"{self.key_prefix}|{grp}"
                with st.expander("Generative Config"):
                    gen_configs[grp] = get_generative_config(key_prefix)
                with st.expander("Retrieval Config"):
                    ret_configs[grp] = get_retrieval_config(key_prefix)

        # A blank submission must not trigger two paid LLM calls.
        run_query = query_submitted and bool(query.strip())
        if query_submitted and not run_query:
            status_placeholder.warning("Please enter a query before submitting.")

        # Results row: one column of output per group.
        for grp, column in zip(grp_names, st.columns(2)):
            with column:
                rkey = f"{self.key_prefix}|{grp}|response"
                model_key = f"{rkey}|gen_config"
                if run_query:
                    with status_placeholder:
                        with st.spinner(
                            "generating response for {}".format(grp_names[grp])
                        ):
                            SS[rkey] = process_query(
                                gen_configs[grp],
                                ret_configs[grp],
                                query,
                            )
                    # Remember which model produced the stored answer so that
                    # later widget changes do not relabel or reprice it.
                    SS[model_key] = {
                        "provider": gen_configs[grp]["provider"],
                        "model_name": gen_configs[grp]["model_name"],
                    }
                if response := SS.get(rkey):
                    used = SS.get(model_key, gen_configs[grp])
                    model_info = PROVIDER_MODELS[used["provider"]][used["model_name"]]
                    # Display toggles follow the live widgets; only the model
                    # identity is pinned to the stored answer.
                    render_response(
                        response,
                        model_info,
                        used["provider"],
                        gen_configs[grp]["should_escape_markdown"],
                        gen_configs[grp]["should_add_legis_urls"],
                        tag=grp_names[grp],
                    )

src/legisqa_local/tabs/rag_tab.py ADDED Viewed

	@@ -0,0 +1,57 @@

+"""Single RAG tab implementation"""
+import streamlit as st
+from legisqa_local.tabs.base import BaseTab
+from legisqa_local.components.forms import get_generative_config, get_retrieval_config
+from legisqa_local.components.display import render_example_queries, render_response
+from legisqa_local.core.rag import process_query
+from legisqa_local.config.models import PROVIDER_MODELS
class RAGTab(BaseTab):
    """Single RAG query tab."""

    def __init__(self):
        super().__init__("RAG", "query_rag")

    def render(self):
        """Render the RAG tab."""
        SS = st.session_state
        render_example_queries()

        with st.form(f"{self.key_prefix}|query_form"):
            query = st.text_area(
                "Enter a query that can be answered with congressional legislation:"
            )
            cols = st.columns(2)
            with cols[0]:
                query_submitted = st.form_submit_button("Submit")
            with cols[1]:
                status_placeholder = st.empty()

        col1, col2 = st.columns(2)
        with col1:
            with st.expander("Generative Config"):
                gen_config = get_generative_config(self.key_prefix)
        with col2:
            with st.expander("Retrieval Config"):
                ret_config = get_retrieval_config(self.key_prefix)

        rkey = f"{self.key_prefix}|response"
        model_key = f"{rkey}|gen_config"

        if query_submitted:
            if query.strip():
                with status_placeholder:
                    with st.spinner("generating response"):
                        SS[rkey] = process_query(gen_config, ret_config, query)
                # Remember which model produced the stored answer so that
                # later widget changes do not relabel or reprice it.
                SS[model_key] = {
                    "provider": gen_config["provider"],
                    "model_name": gen_config["model_name"],
                }
            else:
                # A blank submission must not trigger a paid LLM call.
                status_placeholder.warning(
                    "Please enter a query before submitting."
                )

        if response := SS.get(rkey):
            used = SS.get(model_key, gen_config)
            model_info = PROVIDER_MODELS[used["provider"]][used["model_name"]]
            # Display toggles follow the live widgets; only the model identity
            # is pinned to the stored answer.
            render_response(
                response,
                model_info,
                used["provider"],
                gen_config["should_escape_markdown"],
                gen_config["should_add_legis_urls"],
            )
            with st.expander("Debug"):
                st.write(response)

src/legisqa_local/utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Utility functions and helpers"""

src/legisqa_local/utils/formatting.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""Document formatting utilities for LegisQA"""
+from collections import defaultdict
+import json
+from langchain.schema import Document
+import streamlit as st
+from legisqa_local.utils.text import get_congress_gov_url, get_sponsor_url, escape_markdown
def group_docs(docs) -> list[tuple[str, list[Document]]]:
    """Bucket chunks by bill and order both the buckets and their contents.

    Chunks sharing a ``legis_id`` form one bucket. Inside a bucket the chunks
    are ordered by ``start_index``. Buckets are ordered by chunk count,
    largest first, with ``legis_id`` breaking ties so the order is
    deterministic.

    Returns:
        ``[(legis_id, chunks), ...]`` from the largest bucket to the smallest.
    """
    by_bill = defaultdict(list)
    for chunk in docs:
        by_bill[chunk.metadata["legis_id"]].append(chunk)

    buckets = [
        (bill_id, sorted(chunks, key=lambda c: c.metadata["start_index"]))
        for bill_id, chunks in by_bill.items()
    ]
    # Negated length sorts bigger buckets first under an ascending sort.
    buckets.sort(key=lambda bucket: (-len(bucket[1]), bucket[0]))
    return buckets
def format_docs(docs: list[Document]) -> str:
    """Serialise retrieved chunks into the JSON context handed to the LLM.

    Each bill becomes one JSON object; its bill-level fields come from the
    first chunk of the group and "snippets" lists the text of every chunk.
    """

    def as_record(chunks):
        head = chunks[0].metadata
        return {
            "legis_id": head["legis_id"],
            "title": head["title"],
            "introduced_date": head["introduced_date"],
            "sponsor": head["sponsor_full_name"],
            "snippets": [chunk.page_content for chunk in chunks],
        }

    records = [as_record(chunks) for _, chunks in group_docs(docs)]
    return json.dumps(records, indent=4)
def render_doc_grp(legis_id: str, doc_grp: list[Document]):
    """Draw one expander holding all retrieved chunks of a single bill.

    The expander label carries the chunk count, bill ID, title, a
    congress.gov link and a link to the sponsor's Bioguide page; all of these
    are read from the first chunk's metadata.
    """
    meta = doc_grp[0].metadata
    bill_url = get_congress_gov_url(
        meta["congress_num"],
        meta["legis_type"],
        meta["legis_num"],
    )
    bill_link = f"[congress.gov]({bill_url})"
    label = "{} chunks from {}\n\n{}\n\n{}\n\n[{} ({}) ]({})".format(
        len(doc_grp),
        meta["legis_id"],
        meta["title"],
        bill_link,
        meta["sponsor_full_name"],
        meta["sponsor_bioguide_id"],
        get_sponsor_url(meta["sponsor_bioguide_id"]),
    )

    # Prefix each chunk with its offset so readers can see the gaps.
    snippets = []
    for doc in doc_grp:
        offset = int(doc.metadata["start_index"])
        snippets.append("[start_index={}] ".format(offset) + doc.page_content)
    body = "\n\n...\n\n".join(snippets)

    with st.expander(label):
        st.write(escape_markdown(body))
def render_retrieved_chunks(docs: list[Document], tag: str | None = None):
    """Draw the bordered panel listing every retrieved chunk, bill by bill.

    Args:
        docs: Retrieved chunks, in any order.
        tag: Optional label appended to the panel heading.
    """
    hint = "\n\nleft click to expand, right click to follow links"
    with st.container(border=True):
        grouped = group_docs(docs)
        heading = (
            "Retrieved Chunks" if tag is None else f"Retrieved Chunks ({tag})"
        )
        st.write(heading + hint)
        for bill_id, chunks in grouped:
            render_doc_grp(bill_id, chunks)

src/legisqa_local/utils/text.py ADDED Viewed

	@@ -0,0 +1,55 @@

+"""Text processing utilities for LegisQA"""
+import re
# Short legislation type code -> path segment used in congress.gov bill URLs.
CONGRESS_GOV_TYPE_MAP = dict(
    hconres="house-concurrent-resolution",
    hjres="house-joint-resolution",
    hr="house-bill",
    hres="house-resolution",
    s="senate-bill",
    sconres="senate-concurrent-resolution",
    sjres="senate-joint-resolution",
    sres="senate-resolution",
)
def escape_markdown(text: str) -> str:
    """Return ``text`` with every markdown special character backslash-escaped."""
    special_chars = r"\`*_{}[]()#+-.!$"
    # One translation pass: each special character maps to "\" + itself.
    escapes = {ord(ch): "\\" + ch for ch in special_chars}
    return text.translate(escapes)
def get_sponsor_url(bioguide_id: str) -> str:
    """Return the bioguide.congress.gov biography URL for a Bioguide ID."""
    return "https://bioguide.congress.gov/search/bio/{}".format(bioguide_id)
def get_congress_gov_url(congress_num: int, legis_type: str, legis_num: int) -> str:
    """Generate the Congress.gov URL for a piece of legislation.

    Args:
        congress_num: Congress number; anything ``int()`` accepts.
        legis_type: Short type code, a key of ``CONGRESS_GOV_TYPE_MAP``.
        legis_num: Bill number; anything ``int()`` accepts.

    Returns:
        URL such as ``https://www.congress.gov/bill/118th-congress/senate-bill/2293``.

    Raises:
        KeyError: If ``legis_type`` is not a known type code.
        ValueError: If a number cannot be converted to ``int``.
    """
    lt = CONGRESS_GOV_TYPE_MAP[legis_type]
    congress = int(congress_num)

    # English ordinal suffix: 11-13 take "th"; otherwise the last digit
    # decides (101st, 102nd, 103rd, 104th ...).
    if congress % 100 in (11, 12, 13):
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(congress % 10, "th")

    return f"https://www.congress.gov/bill/{congress}{suffix}-congress/{lt}/{int(legis_num)}"
def legis_id_to_link(legis_id: str) -> str:
    """Build the Congress.gov URL for an ID shaped like ``"118-s-2293"``."""
    # The ID must consist of exactly three dash-separated fields.
    congress, kind, number = legis_id.split("-")
    return get_congress_gov_url(congress, kind, number)
def legis_id_match_to_link(matchobj):
    """Turn a regex match on a legislation ID into a markdown link."""
    matched_id = matchobj.group(0)
    return f"[{matched_id}]({legis_id_to_link(matched_id)})"
def replace_legis_ids_with_urls(text: str) -> str:
    """Replace legislation IDs in ``text`` with markdown links.

    An ID is ``<congress>-<type>-<number>`` where the congress is 113-119,
    the type is a key of ``CONGRESS_GOV_TYPE_MAP`` and the number has one to
    five digits. Restricting the type to known codes means free text that
    merely looks like an ID (e.g. ``"116-abc-12"``) is left alone instead of
    raising ``KeyError`` in the URL builder.
    """
    # Longest codes first so alternation prefers "sconres" over "s".
    type_codes = "|".join(
        re.escape(code)
        for code in sorted(CONGRESS_GOV_TYPE_MAP, key=len, reverse=True)
    )
    # Word boundaries stop the pattern from matching inside longer tokens.
    pattern = rf"\b11[3-9]-(?:{type_codes})-\d{{1,5}}\b"
    return re.sub(pattern, legis_id_match_to_link, text)

src/legisqa_local/utils/usage.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""Usage tracking utilities for LegisQA"""
+import streamlit as st
+from langchain_core.messages import AIMessage
def get_token_usage_for_provider(aimessage: "AIMessage", model_info: dict, provider: str):
    """Compute token counts and cost for one model response.

    Args:
        aimessage: Model response; its ``usage_metadata`` supplies the
            "input_tokens" and "output_tokens" counts.
        model_info: Pricing entry whose ``["cost"]["pmi"]`` and
            ``["cost"]["pmo"]`` are prices per million input / output tokens.
        provider: Provider name; currently unused because every provider is
            priced the same way.

    Returns:
        Dict with "input_tokens", "output_tokens" and "cost".
    """
    # usage_metadata can be absent when a provider reports no usage; count
    # that as zero tokens rather than failing while rendering the answer.
    usage = getattr(aimessage, "usage_metadata", None) or {}
    input_tokens = usage.get("input_tokens", 0)
    output_tokens = usage.get("output_tokens", 0)

    prices = model_info["cost"]
    cost = (
        input_tokens * 1e-6 * prices["pmi"]
        + output_tokens * 1e-6 * prices["pmo"]
    )
    return {
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "cost": cost,
    }
def get_token_usage(aimessage: AIMessage, model_info: dict, provider: str):
    """Return token counts and cost for a response from any provider."""
    # Every provider shares one calculation, so simply delegate.
    return get_token_usage_for_provider(
        aimessage=aimessage,
        model_info=model_info,
        provider=provider,
    )
def display_api_usage(
    aimessage: AIMessage, model_info: dict, provider: str, tag: str | None = None
):
    """Draw the token/cost metrics panel for one model response.

    Args:
        aimessage: Model response whose usage is reported.
        model_info: Pricing entry for the model that produced it.
        provider: Provider name, forwarded to the usage calculation.
        tag: Optional label appended to the panel heading.
    """
    heading = "API Usage" if tag is None else f"API Usage ({tag})"
    with st.container(border=True):
        st.write(heading)
        usage = get_token_usage(aimessage, model_info, provider)

        metrics = (
            ("Input Tokens", usage["input_tokens"]),
            ("Output Tokens", usage["output_tokens"]),
            ("Cost", f"${usage['cost']:.4f}"),
        )
        for column, (label, value) in zip(st.columns(3), metrics):
            with column:
                st.metric(label, value)

        with st.expander("AIMessage Metadata"):
            # Everything except the answer text, which is shown elsewhere.
            metadata = dict(aimessage.dict())
            metadata.pop("content", None)
            st.write(metadata)