abrezey commited on
Commit
98c76e4
·
1 Parent(s): 90e1bbf

first commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Abstra
171
+ # Abstra is an AI-powered process automation framework.
172
+ # Ignore directories containing user credentials, local state, and settings.
173
+ # Learn more at https://abstra.io/docs
174
+ .abstra/
175
+
176
+ # Visual Studio Code
177
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
178
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
179
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
180
+ # you could uncomment the following to ignore the entire vscode folder
181
+ # .vscode/
182
+
183
+ # Ruff stuff:
184
+ .ruff_cache/
185
+
186
+ # PyPI configuration file
187
+ .pypirc
188
+
189
+ # Cursor
190
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
191
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
192
+ # refer to https://docs.cursor.com/context/ignore-files
193
+ .cursorignore
194
+ .cursorindexingignore
195
+
196
+ .gradio
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio app exposing U.S. Census Bureau 2020 decennial census tools as an MCP server."""

import gradio as gr
from mcp_functions.census_api_calls import (
    decennial_2020_demographic_profile,
    decennial_2020_demographic_profile_fips_lookup,
)
from mcp_functions.census_api_docs import (
    import_decennial_2020_datasets_homepage,
    import_decennial_2020_demographic_profile_geographies,
    import_decennial_2020_demographic_profile_variables,
)
from mcp_functions.census_utils import (
    decennial_2020_dhc_semantic_search,
    required_geograpy_hierarchy_parents,
)

# One gr.Interface per MCP tool; each wraps a function from mcp_functions.
decennial_2020_demographic_and_housing_characteristics_semantic_search_interface = gr.Interface(
    fn=decennial_2020_dhc_semantic_search,
    inputs=["textbox"],
    outputs=gr.JSON(),
    title="U.S. Census Bureau 2020 demographic and housing characteristics documentation",
    description="Fetches demographic and housing characteristics documentation for the 2020 U.S. Census Bureau decennial census.",
)

decennial_2020_demographic_profile_interface = gr.Interface(
    fn=decennial_2020_demographic_profile,
    inputs=["textbox", "textbox", "textbox"],
    outputs=gr.JSON(),
    title="U.S. Census Bureau 2020 Demographic Profile data",
    description="Fetches demographic profile data from the 2020 U.S. Census Bureau decennial API.",
)

decennial_2020_demographic_profile_fips_lookup_interface = gr.Interface(
    fn=decennial_2020_demographic_profile_fips_lookup,
    inputs=["textbox", "textbox", "textbox"],
    outputs=gr.JSON(),
    title="FIPS code lookup for the U.S. Census Bureau 2020 decennial census demographic profile dataset",
    description="Lookup FIPS codes for geography hierarchies provided by the U.S. Census Bureau 2020 decennial census demographic profile dataset",
)

decennial_2020_demographic_profile_geographies_required_parent_geographies_interface = gr.Interface(
    fn=required_geograpy_hierarchy_parents,
    inputs=["textbox"],
    outputs=gr.JSON(),
    title="Geography Hierarchy required parent geographies",
    description="Utility function that provides required parent geographies when requesting geography hierarchies during U.S. Census Bureau 2020 decennial census demographic profile API calls",
)

decennial_2020_demographic_profile_geographies_interface = gr.Interface(
    fn=import_decennial_2020_demographic_profile_geographies,
    inputs=[],
    outputs=gr.TextArea(),
    title="U.S. Census Bureau 2020 decennial census demographic profile dataset geographies",
    description="Information on available geographies for the U.S. Census Bureau 2020 decennial census demographic profile API.",
)

decennial_2020_demographic_profile_variables_interface = gr.Interface(
    fn=import_decennial_2020_demographic_profile_variables,
    inputs=[],
    outputs=gr.TextArea(),
    title="U.S. Census Bureau 2020 decennial census demographic profile dataset variables",
    description="Information on available variables for the U.S. Census Bureau 2020 decennial census demographic profile API.",
)


decennial_2020_datasets_homepage_interface = gr.Interface(
    fn=import_decennial_2020_datasets_homepage,
    inputs=[],
    outputs=gr.TextArea(),
    title="U.S. Census Bureau 2020 decennial census datasets",
    description="Receive information on available datasets as well as links to helpful documentation",
)


# Tab order and tab titles are positional: the two lists below must stay in sync.
demo = gr.TabbedInterface(
    [
        decennial_2020_datasets_homepage_interface,
        decennial_2020_demographic_profile_geographies_interface,
        decennial_2020_demographic_profile_variables_interface,
        decennial_2020_demographic_profile_geographies_required_parent_geographies_interface,
        decennial_2020_demographic_profile_fips_lookup_interface,
        decennial_2020_demographic_profile_interface,
        decennial_2020_demographic_and_housing_characteristics_semantic_search_interface,
    ],
    [
        "2020 U.S. Census Bureau decennial census API Homepage",
        "2020 U.S. Census Bureau decennial census demographic profile API geographies",
        "2020 U.S. Census Bureau decennial census demographic profile API variables",
        "2020 U.S. Census Bureau decennial census demographic profile geography hierarchy required parent geographies",
        "2020 U.S. Census Bureau decennial census demographic profile geography hierarchy FIPS code lookup",
        "2020 U.S. Census Bureau decennial census demographic profile API data",
        "2020 U.S. Census Bureau decennial census demographic and housing characteristics documentation",
    ],
)


if __name__ == "__main__":
    # mcp_server=True exposes each interface function as an MCP tool.
    demo.launch(mcp_server=True)
mcp_functions/__init__.py ADDED
File without changes
mcp_functions/census_api_calls.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import os
3
+ from collections import defaultdict
4
+ from typing import List, Tuple
5
+
6
+ import requests
7
+ from dotenv import load_dotenv
8
+
9
+ from mcp_functions.utils import build_fips_lookup, find_required_parent_geographies
10
+
11
+
12
def parse_required_parent_geographies_string(data_str: str) -> List[Tuple[str, str]]:
    """
    Parse a string like "[('state', '06'), ('county', '037')]" into a list of
    (geography hierarchy, FIPS code) tuples.

    Args:
        data_str (str): Python-literal representation of a list of two-string tuples.

    Returns:
        List[Tuple[str, str]]: The parsed (hierarchy, FIPS code) pairs.

    Raises:
        ValueError: If the string is not a valid literal, is not a list, or
            contains anything other than two-string tuples.
    """
    try:
        # literal_eval only evaluates Python literals, so untrusted input
        # cannot execute arbitrary code (unlike eval).
        result = ast.literal_eval(data_str)

        # Validate structure
        if not isinstance(result, list):
            raise ValueError("Parsed required parent geographies string is not a list.")
        for item in result:
            if not (
                isinstance(item, tuple)
                and len(item) == 2
                and all(isinstance(elem, str) for elem in item)
            ):
                raise ValueError(
                    "Each item in the parse required parent geographies list must be a tuple of two strings."
                )

        return result

    except (SyntaxError, ValueError) as e:
        # Chain the original exception so the root cause stays visible.
        raise ValueError(
            f"Failed to parse input string into List[Tuple[str, str]]: {e}"
        ) from e
35
+
36
+
37
# Pull CENSUS_API_KEY (and any other settings) from a local .env file into
# the process environment before reading it.
load_dotenv()
API_KEY: str | None = os.getenv("CENSUS_API_KEY")
if not API_KEY:
    # Fail fast at import time: every API call below needs a key, so a
    # missing/empty variable should stop the app immediately.
    raise ValueError("Set a CENSUS_API_KEY environment variable")
42
+
43
+
44
def decennial_2020_demographic_profile_fips_lookup(
    geography_hierarchy: str,
    name: str,
    required_parent_geographies: str,
):
    """
    Fetches FIPS code for a given geography hierarchy and name. Also returns FIPS code for any parent geographies.

    Args:
        geography_hierarchy (str): The geographic level to query (e.g. 'region', 'state', 'county', etc.).
        name (str): The name of the geographic entity (e.g., 'California', 'Los Angeles County, California').
        required_parent_geographies (str): A string representing required parent geographies and their FIPS codes in the format "[('<geography hierarchy>', '<FIPS code>'), ('<geography hierarchy>', '<FIPS code>')]"

    Returns:
        Dict[str, str]: dictionary representing FIPS code values for provided geography_hierarchy.

    Raises:
        ValueError: If the API rejects the geography hierarchy.
        RuntimeError: For any other API request failure.
        KeyError: If no row matches *name*.
    """

    BASE_URL = "https://api.census.gov/data/2020/dec/dp"

    variables = ["NAME"]
    for_clause = f"{geography_hierarchy}:*"
    params = {"get": variables, "for": for_clause, "key": API_KEY}

    ###################
    # Parse parent geographies into API friendly "in" clause, e.g.
    # "state:06 county:037,038"
    ###################
    parsed_parent_geographies: List[Tuple[str, str]] = (
        parse_required_parent_geographies_string(required_parent_geographies)
    )
    if parsed_parent_geographies:
        # Group FIPS values by geography hierarchy key
        grouped = defaultdict(list)
        for key, value in parsed_parent_geographies:
            grouped[key].append(value)
        params["in"] = " ".join(f"{key}:{','.join(grouped[key])}" for key in grouped)

    # Initialize up front: if requests.get itself fails (e.g. a connection
    # error) the handler below would otherwise hit an unbound local.
    error_text = ""
    try:
        response = requests.get(BASE_URL, params=params, timeout=30)
        # Store body before raising so we can recognize specific API errors
        error_text = response.text
        response.raise_for_status()
        data = response.json()
    except requests.RequestException as e:
        if error_text == "error: unknown/unsupported geography hierarchy":
            # Computed outside the f-string: multi-line expressions inside
            # f-string braces are only valid on Python 3.12+.
            required = find_required_parent_geographies(geography_hierarchy)
            raise ValueError(
                "Invalid geography hierarchy provided.",
                "Acceptable required_parent_geographies must be provided.",
                f"{geography_hierarchy} requires the following parent geographies: {required}",
            ) from e
        raise RuntimeError(f"Failed to fetch data from the Census API: {e} ") from e

    # Right now, build_fips_lookup builds a lookup table that includes the FIPS code for
    # the requested geography and all of its parent geographies.
    # If we only want the FIPS code for the requested geography, we can modify this function.
    lookup = build_fips_lookup(data)

    try:
        return lookup[name]
    except KeyError:
        raise KeyError(
            f"Could not find FIPS code for {name} in {geography_hierarchy}. "
            "Perhaps you input the wrong name or geography_hierarchy? "
            "Try appending a geography to your name input like so: `name, state`"
        )
111
+
112
+
113
def decennial_2020_demographic_profile(
    get_variables: str, for_clauses: str, in_clauses: str
):
    """
    Fetches demographic profile data from the U.S. Census Bureau API.

    Args:
        get_variables (str): The Census variables to retrieve, comma-separated (e.g., 'DP1_0001C', 'DP1_0001C,DP1_0003C').
        for_clauses (str): The geographic level to query (e.g., 'us:*', 'state:06', 'state:04,06').
        in_clauses (str): Higher-level geography for nested queries (e.g., 'state:06', 'state:06 county:037,038').

    Returns:
        list[dict]: Parsed response with column headers and row data as dictionaries.
    """
    BASE_URL = "https://api.census.gov/data/2020/dec/dp"

    # Sequence-of-pairs form keeps the query parameters in a fixed order.
    query = [
        ("get", get_variables),
        ("for", for_clauses),
        ("in", in_clauses),
        ("key", API_KEY),
    ]

    response = requests.get(BASE_URL, params=query)
    response.raise_for_status()
    payload = response.json()

    # First row of the Census response is the header; zip each remaining
    # row with it to produce a dict per geography.
    header, *rows = payload
    return [dict(zip(header, row)) for row in rows]
mcp_functions/census_api_docs.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from markdownify import markdownify
3
+
4
+
5
def import_decennial_2020_datasets_homepage() -> str:
    """
    Fetches the homepage for the U.S. Census Bureau 2020 decennial census API housed at https://api.census.gov/data/2020/dec/dp.html.
    Contains descriptions of available datasets.
    Also includes links to additional helpful documentation such as available geographies and example API calls.

    Args:

    Returns:
        str: The homepage in markdown format

    Raises:
        requests.HTTPError: If the documentation page cannot be fetched.
    """

    # Timeout keeps an unresponsive server from hanging the app;
    # raise_for_status stops an HTTP error page being converted to markdown.
    response = requests.get("https://api.census.gov/data/2020/dec/dp.html", timeout=30)
    response.raise_for_status()

    return markdownify(response.text.strip())
20
+
21
+
22
def import_decennial_2020_demographic_profile_geographies() -> str:
    """
    Fetches information on available geographies for the U.S. Census Bureau 2020 decennial census demographic profile API housed at https://api.census.gov/data/2020/dec/dp/geography.html.
    Includes:
    * Geography Levels
    * Geography Hierarchy.

    Args:

    Returns:
        str: The information in markdown format

    Raises:
        requests.HTTPError: If the documentation page cannot be fetched.
    """

    # Timeout keeps an unresponsive server from hanging the app;
    # raise_for_status stops an HTTP error page being converted to markdown.
    response = requests.get(
        "https://api.census.gov/data/2020/dec/dp/geography.html", timeout=30
    )
    response.raise_for_status()

    return markdownify(response.text.strip())
38
+
39
+
40
def import_decennial_2020_demographic_profile_variables() -> str:
    """
    Fetches information on available variables for the U.S. Census Bureau 2020 decennial census demographic profile API housed at https://api.census.gov/data/2020/dec/dp/variables.html.
    * "Name" -- used to access variable during API calls
    * "Label" -- description of variable

    Args:

    Returns:
        str: The information in markdown format

    Raises:
        requests.HTTPError: If the documentation page cannot be fetched.
    """

    # Timeout keeps an unresponsive server from hanging the app;
    # raise_for_status stops an HTTP error page being converted to markdown.
    response = requests.get(
        "https://api.census.gov/data/2020/dec/dp/variables.html", timeout=30
    )
    response.raise_for_status()

    return markdownify(response.text.strip())
mcp_functions/census_utils.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from langchain_core.documents import Document
4
+ from vector_databases.census_dhc_dp_techdoc import census_dhc_dp_techdoc
5
+
6
+ from mcp_functions.utils import find_required_parent_geographies
7
+
8
+
9
def required_geograpy_hierarchy_parents(geography_hierarchy: str) -> List[str | None]:
    """
    MCP-facing wrapper around find_required_parent_geographies.

    Given the intent to look up a geography hierarchy within the U.S. Census
    Bureau 2020 decennial census demographic profile API, return the parent
    geographies that must be included with the request.

    Args:
        geography_hierarchy (str): The geographic level to query (e.g. 'region', 'state', 'county', 'principal city (or part)', etc.).
    Returns:
        List[str]: List of strings representing the required parent geographies.
    """
    # NOTE(review): the name misspells "geography"; kept as-is because
    # app.py imports it under exactly this name.
    parents = find_required_parent_geographies(geography_hierarchy)
    return parents
21
+
22
+
23
def decennial_2020_dhc_semantic_search(
    query: str,
    k: int = 4,
) -> List[Document]:
    """
    Perform a semantic search on the 2020 Census Demographic and Housing Characteristics File (DHC) housed at https://www2.census.gov/programs-surveys/decennial/2020/technical-documentation/complete-tech-docs/demographic-and-housing-characteristics-file-and-demographic-profile/2020census-demographic-and-housing-characteristics-file-and-demographic-profile-techdoc.pdf

    Args:
        query (str): The semantic query to perform.
        k (int): Number of most-similar documents to return. Defaults to 4,
            matching the previous hard-coded behavior.
    Returns:
        (List[Document]): The semantically related documents
    """
    # Delegates to the in-memory vector store built at import time in
    # vector_databases.census_dhc_dp_techdoc.
    docs = census_dhc_dp_techdoc.vector_store.similarity_search(query, k=k)

    return docs
mcp_functions/utils.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+
3
# Nested map of geography hierarchies for the 2020 decennial census
# demographic profile dataset. A child geography nests under its parent's
# key; a node's optional "required_parent_hierarchies" entry lists the
# parent geographies that must accompany it in an API "in" clause.
# Consumed by find_required_parent_geographies below.
geography_hierarchy_key = {
    "us": {},
    "region": {},
    "division": {},
    "state": {
        "county": {
            "county subdivision": {
                "required_parent_hierarchies": ["state"],
                "subminor civil division": {
                    "required_parent_hierarchies": [
                        "state",
                        "county",
                        "county subdivision",
                    ],
                },
            },
            "tract": {
                "required_parent_hierarchies": ["state"],
            },
        },
        "place": {},
        "consolidated city": {},
        "alaska native regional corporation": {},
        "american indian area/alaska native area/hawaiian home land (or part)": {
            "required_parent_hierarchies": ["state"],
            "tribal subdivision/remainder (or part)": {
                "required_parent_hierarchies": [
                    "state",
                    "american indian area/alaska native area/hawaiian home land (or part)",
                ]
            },
        },
        "metropolitan statistical area/micropolitan statistical area (or part)": {
            "required_parent_hierarchies": ["state"],
            "principal city (or part)": {
                "required_parent_hierarchies": [
                    "state",
                    "metropolitan statistical area/micropolitan statistical area (or part)",
                ]
            },
            "metropolitan division (or part)": {
                "required_parent_hierarchies": [
                    "state",
                    "metropolitan statistical area/micropolitan statistical area (or part)",
                ]
            },
        },
        "combined statistical area (or part)": {
            "required_parent_hierarchies": ["state"]
        },
        "combined new england city and town area (or part)": {
            "required_parent_hierarchies": ["state"],
        },
        "new england city and town area (or part)": {
            "required_parent_hierarchies": ["state"],
            "principal city": {
                "required_parent_hierarchies": [
                    "state",
                    "new england city and town area (or part)",
                ]
            },
            "necta division (or part)": {
                "required_parent_hierarchies": [
                    "state",
                    "new england city and town area (or part)",
                ]
            },
        },
        "congressional district": {},
        "state legislative district (upper chamber)": {
            "required_parent_hierarchies": ["state"]
        },
        "state legislative district (lower chamber)": {
            "required_parent_hierarchies": ["state"]
        },
        "zip code tabulation area (or part)": {
            "required_parent_hierarchies": ["state"]
        },
        "school district (elementary)": {},
        "school district (secondary)": {},
        "school district (unified)": {},
    },
    "american indian area/alaska native area/hawaiian home land": {
        "tribal subdivision/remainder": {},
        "tribal census tract": {
            "required_parent_hierarchies": [
                "american indian area/alaska native area/hawaiian home land"
            ]
        },
    },
    "metropolitan statistical area/micropolitan statistical area": {
        "state (or part)": {
            "principal city (or part)": {
                "required_parent_hierarchies": [
                    "metropolitan statistical area/micropolitan statistical area",
                    "state (or part)",
                ]
            }
        },
        "metropolitan division": {
            "required_parent_hierarchies": [
                "metropolitan statistical area/micropolitan statistical area"
            ]
        },
    },
    "combined statistical area": {},
    "combined new england city and town area": {},
    "new england city and town area": {
        "state (or part)": {
            "principal city": {
                "required_parent_hierarchies": [
                    "new england city and town area",
                    "state (or part)",
                ]
            }
        },
        "necta division": {
            "required_parent_hierarchies": ["new england city and town area"]
        },
    },
    "zip code tabulation area": {},
}
125
+
126
+
127
def find_required_parent_geographies(target_key: str) -> list[str]:
    """
    Breadth-first search geography_hierarchy_key for *target_key* and return
    its required parent geographies.

    Args:
        target_key (str): The geography hierarchy to look up (e.g. 'county').

    Returns:
        list[str]: The geography hierarchies that must accompany *target_key*
            in an API request; empty when the key needs no parents or is not
            found at all.
    """
    queue = deque([geography_hierarchy_key])

    while queue:
        current = queue.popleft()

        for key, value in current.items():
            if key == target_key:
                # Found the target; report its parents if any are declared.
                if isinstance(value, dict):
                    return value.get("required_parent_hierarchies", [])
                return []
            if isinstance(value, dict):
                queue.append(value)

    # Unknown geography hierarchy.
    return []
148
+
149
+
150
def build_fips_lookup(data: list[list[str]]) -> dict[str, dict[str, str]]:
    """
    Build a name -> FIPS-codes lookup table from a raw Census API response.

    The first response row is the header; each remaining row maps its first
    column (the geography NAME) to a dict of all other columns. Those columns
    carry the FIPS code for the requested geography and each of its parent
    geographies.

    Args:
        data (list[list[str]]): Raw rows from the Census API, header row first.

    Returns:
        dict[str, dict[str, str]]: {NAME: {column: value, ...}} for every
            data row; empty when *data* is empty.
    """
    # Guard: an empty response would otherwise blow up on unpacking.
    if not data:
        return {}

    header, *rows = data

    # Build the lookup dictionary, skipping column 0 (the NAME key itself)
    return {
        row[0]: {col: row[idx] for idx, col in enumerate(header) if idx != 0}
        for row in rows
    }
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio[mcp]
2
+ langchain-community
3
+ langchain-huggingface
4
+ markdownify
5
+ pypdf
vector_databases/__init__.py ADDED
File without changes
vector_databases/census_dhc_dp_techdoc/2020census-demographic-and-housing-characteristics-file-and-demographic-profile-techdoc.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed0e521a1c59a6e4653cd1e0192fe85c537f657aaca33525ece577efce412864
3
+ size 1806867
vector_databases/census_dhc_dp_techdoc/__init__.py ADDED
File without changes
vector_databases/census_dhc_dp_techdoc/census_dhc_dp_techdoc.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from langchain_core.vectorstores import InMemoryVectorStore
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+
5
+
6
def load_pages(
    pdf_path: str = "vector_databases/census_dhc_dp_techdoc/2020census-demographic-and-housing-characteristics-file-and-demographic-profile-techdoc.pdf",
):
    """
    Load every page of a PDF as a list of langchain Documents.

    Args:
        pdf_path (str): Path to the PDF to load. Defaults to the bundled
            2020 Census DHC/DP technical documentation (LFS-tracked).

    Returns:
        list[Document]: One Document per PDF page.
    """
    # PyPDFLoader.load already returns one Document per page; no need to
    # copy it element-by-element.
    return list(PyPDFLoader(pdf_path).load())
15
+
16
+
17
# Import-time side effect: the PDF is parsed and embedded as soon as this
# module is imported, so `vector_store` is ready for callers such as
# mcp_functions.census_utils.
pages = load_pages()


print("Vectorizing census documentation... This may take a few minutes")
# In-memory store: embeddings are recomputed on every process start and are
# never persisted to disk.
vector_store = InMemoryVectorStore.from_documents(
    pages, HuggingFaceEmbeddings(model_name="thenlper/gte-small")
)