abrezey commited on
Commit
98c76e4
·
1 Parent(s): 90e1bbf

first commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Abstra
171
+ # Abstra is an AI-powered process automation framework.
172
+ # Ignore directories containing user credentials, local state, and settings.
173
+ # Learn more at https://abstra.io/docs
174
+ .abstra/
175
+
176
+ # Visual Studio Code
177
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
178
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
179
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
180
+ # you could uncomment the following to ignore the entire vscode folder
181
+ # .vscode/
182
+
183
+ # Ruff stuff:
184
+ .ruff_cache/
185
+
186
+ # PyPI configuration file
187
+ .pypirc
188
+
189
+ # Cursor
190
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
191
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
192
+ # refer to https://docs.cursor.com/context/ignore-files
193
+ .cursorignore
194
+ .cursorindexingignore
195
+
196
+ .gradio
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio app exposing U.S. Census Bureau 2020 decennial census tools as an MCP server."""

import gradio as gr
from mcp_functions.census_api_calls import (
    decennial_2020_demographic_profile,
    decennial_2020_demographic_profile_fips_lookup,
)
from mcp_functions.census_api_docs import (
    import_decennial_2020_datasets_homepage,
    import_decennial_2020_demographic_profile_geographies,
    import_decennial_2020_demographic_profile_variables,
)
from mcp_functions.census_utils import (
    decennial_2020_dhc_semantic_search,
    required_geograpy_hierarchy_parents,
)

# One gr.Interface per MCP tool; each wraps a function from mcp_functions.
decennial_2020_demographic_and_housing_characteristics_semantic_search_interface = gr.Interface(
    fn=decennial_2020_dhc_semantic_search,
    inputs=["textbox"],
    outputs=gr.JSON(),
    title="U.S. Census Bureau 2020 demographic and housing characteristics documentation",
    description="Fetches demographic and housing characteristics documentation for the 2020 U.S. Census Bureau decennial census.",
)

decennial_2020_demographic_profile_interface = gr.Interface(
    fn=decennial_2020_demographic_profile,
    inputs=["textbox", "textbox", "textbox"],
    outputs=gr.JSON(),
    title="U.S. Census Bureau 2020 Demographic Profile data",
    description="Fetches demographic profile data from the 2020 U.S. Census Bureau decennial API.",
)

decennial_2020_demographic_profile_fips_lookup_interface = gr.Interface(
    fn=decennial_2020_demographic_profile_fips_lookup,
    inputs=["textbox", "textbox", "textbox"],
    outputs=gr.JSON(),
    title="FIPS code lookup for the U.S. Census Bureau 2020 decennial census demographic profile dataset",
    description="Lookup FIPS codes for geography hierarchies provided by the U.S. Census Bureau 2020 decennial census demographic profile dataset",
)

decennial_2020_demographic_profile_geographies_required_parent_geographies_interface = gr.Interface(
    fn=required_geograpy_hierarchy_parents,
    inputs=["textbox"],
    outputs=gr.JSON(),
    title="Geography Hierarchy required parent geographies",
    description="Utility function that provides required parent geographies when requesting geography hierarchies during U.S. Census Bureau 2020 decennial census demographic profile API calls",
)

decennial_2020_demographic_profile_geographies_interface = gr.Interface(
    fn=import_decennial_2020_demographic_profile_geographies,
    inputs=[],
    outputs=gr.TextArea(),
    title="U.S. Census Bureau 2020 decennial census demographic profile dataset geographies",
    description="Information on available geographies for the U.S. Census Bureau 2020 decennial census demographic profile API.",
)

decennial_2020_demographic_profile_variables_interface = gr.Interface(
    fn=import_decennial_2020_demographic_profile_variables,
    inputs=[],
    outputs=gr.TextArea(),
    title="U.S. Census Bureau 2020 decennial census demographic profile dataset variables",
    description="Information on available variables for the U.S. Census Bureau 2020 decennial census demographic profile API.",
)


decennial_2020_datasets_homepage_interface = gr.Interface(
    fn=import_decennial_2020_datasets_homepage,
    inputs=[],
    outputs=gr.TextArea(),
    title="U.S. Census Bureau 2020 decennial census datasets",
    description="Receive information on available datasets as well as links to helpful documentation",
)


# Tab order and tab titles are positional: the two lists below must stay in sync.
demo = gr.TabbedInterface(
    [
        decennial_2020_datasets_homepage_interface,
        decennial_2020_demographic_profile_geographies_interface,
        decennial_2020_demographic_profile_variables_interface,
        decennial_2020_demographic_profile_geographies_required_parent_geographies_interface,
        decennial_2020_demographic_profile_fips_lookup_interface,
        decennial_2020_demographic_profile_interface,
        decennial_2020_demographic_and_housing_characteristics_semantic_search_interface,
    ],
    [
        "2020 U.S. Census Bureau decennial census API Homepage",
        "2020 U.S. Census Bureau decennial census demographic profile API geographies",
        "2020 U.S. Census Bureau decennial census demographic profile API variables",
        "2020 U.S. Census Bureau decennial census demographic profile geography hierarchy required parent geographies",
        "2020 U.S. Census Bureau decennial census demographic profile geography hierarchy FIPS code lookup",
        "2020 U.S. Census Bureau decennial census demographic profile API data",
        "2020 U.S. Census Bureau decennial census demographic and housing characteristics documentation",
    ],
)


if __name__ == "__main__":
    # mcp_server=True exposes each interface function as an MCP tool.
    demo.launch(mcp_server=True)
mcp_functions/__init__.py ADDED
File without changes
mcp_functions/census_api_calls.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import os
3
+ from collections import defaultdict
4
+ from typing import List, Tuple
5
+
6
+ import requests
7
+ from dotenv import load_dotenv
8
+
9
+ from mcp_functions.utils import build_fips_lookup, find_required_parent_geographies
10
+
11
+
12
def parse_required_parent_geographies_string(data_str: str) -> List[Tuple[str, str]]:
    """
    Parse a string like "[('state', '06'), ('county', '037')]" into a list of
    (geography hierarchy, FIPS code) tuples.

    Args:
        data_str (str): Python-literal representation of a list of two-string tuples.

    Returns:
        List[Tuple[str, str]]: The parsed (hierarchy, FIPS code) pairs.

    Raises:
        ValueError: If the string is not a valid literal, is not a list, or
            contains anything other than two-string tuples.
    """
    try:
        # literal_eval only evaluates Python literals, so untrusted input
        # cannot execute arbitrary code (unlike eval).
        result = ast.literal_eval(data_str)

        # Validate structure
        if not isinstance(result, list):
            raise ValueError("Parsed required parent geographies string is not a list.")
        for item in result:
            if not (
                isinstance(item, tuple)
                and len(item) == 2
                and all(isinstance(elem, str) for elem in item)
            ):
                raise ValueError(
                    "Each item in the parse required parent geographies list must be a tuple of two strings."
                )

        return result

    except (SyntaxError, ValueError) as e:
        # Chain the original exception so the root cause stays visible.
        raise ValueError(
            f"Failed to parse input string into List[Tuple[str, str]]: {e}"
        ) from e
35
+
36
+
37
# Pull CENSUS_API_KEY (and any other settings) from a local .env file into
# the process environment before reading it.
load_dotenv()
API_KEY: str | None = os.getenv("CENSUS_API_KEY")
if not API_KEY:
    # Fail fast at import time: every API call below needs a key, so a
    # missing/empty variable should stop the app immediately.
    raise ValueError("Set a CENSUS_API_KEY environment variable")
42
+
43
+
44
def decennial_2020_demographic_profile_fips_lookup(
    geography_hierarchy: str,
    name: str,
    required_parent_geographies: str,
):
    """
    Fetches FIPS code for a given geography hierarchy and name. Also returns FIPS code for any parent geographies.

    Args:
        geography_hierarchy (str): The geographic level to query (e.g. 'region', 'state', 'county', etc.).
        name (str): The name of the geographic entity (e.g., 'California', 'Los Angeles County, California').
        required_parent_geographies (str): A string representing required parent geographies and their FIPS codes in the format "[('<geography hierarchy>', '<FIPS code>'), ('<geography hierarchy>', '<FIPS code>')]"

    Returns:
        Dict[str, str]: dictionary representing FIPS code values for provided geography_hierarchy.

    Raises:
        ValueError: If the API rejects the geography hierarchy.
        RuntimeError: For any other API request failure.
        KeyError: If no row matches *name*.
    """

    BASE_URL = "https://api.census.gov/data/2020/dec/dp"

    variables = ["NAME"]
    for_clause = f"{geography_hierarchy}:*"
    params = {"get": variables, "for": for_clause, "key": API_KEY}

    ###################
    # Parse parent geographies into API friendly "in" clause, e.g.
    # "state:06 county:037,038"
    ###################
    parsed_parent_geographies: List[Tuple[str, str]] = (
        parse_required_parent_geographies_string(required_parent_geographies)
    )
    if parsed_parent_geographies:
        # Group FIPS values by geography hierarchy key
        grouped = defaultdict(list)
        for key, value in parsed_parent_geographies:
            grouped[key].append(value)
        params["in"] = " ".join(f"{key}:{','.join(grouped[key])}" for key in grouped)

    # Initialize up front: if requests.get itself fails (e.g. a connection
    # error) the handler below would otherwise hit an unbound local.
    error_text = ""
    try:
        response = requests.get(BASE_URL, params=params, timeout=30)
        # Store body before raising so we can recognize specific API errors
        error_text = response.text
        response.raise_for_status()
        data = response.json()
    except requests.RequestException as e:
        if error_text == "error: unknown/unsupported geography hierarchy":
            # Computed outside the f-string: multi-line expressions inside
            # f-string braces are only valid on Python 3.12+.
            required = find_required_parent_geographies(geography_hierarchy)
            raise ValueError(
                "Invalid geography hierarchy provided.",
                "Acceptable required_parent_geographies must be provided.",
                f"{geography_hierarchy} requires the following parent geographies: {required}",
            ) from e
        raise RuntimeError(f"Failed to fetch data from the Census API: {e} ") from e

    # Right now, build_fips_lookup builds a lookup table that includes the FIPS code for
    # the requested geography and all of its parent geographies.
    # If we only want the FIPS code for the requested geography, we can modify this function.
    lookup = build_fips_lookup(data)

    try:
        return lookup[name]
    except KeyError:
        raise KeyError(
            f"Could not find FIPS code for {name} in {geography_hierarchy}. "
            "Perhaps you input the wrong name or geography_hierarchy? "
            "Try appending a geography to your name input like so: `name, state`"
        )
111
+
112
+
113
def decennial_2020_demographic_profile(
    get_variables: str, for_clauses: str, in_clauses: str
):
    """
    Fetches demographic profile data from the U.S. Census Bureau API.

    Args:
        get_variables (str): The Census variables to retrieve, comma-separated (e.g., 'DP1_0001C', 'DP1_0001C,DP1_0003C').
        for_clauses (str): The geographic level to query (e.g., 'us:*', 'state:06', 'state:04,06').
        in_clauses (str): Higher-level geography for nested queries (e.g., 'state:06', 'state:06 county:037,038').

    Returns:
        list[dict]: Parsed response with column headers and row data as dictionaries.
    """
    BASE_URL = "https://api.census.gov/data/2020/dec/dp"

    # Sequence-of-pairs form keeps the query parameters in a fixed order.
    query = [
        ("get", get_variables),
        ("for", for_clauses),
        ("in", in_clauses),
        ("key", API_KEY),
    ]

    response = requests.get(BASE_URL, params=query)
    response.raise_for_status()
    payload = response.json()

    # First row of the Census response is the header; zip each remaining
    # row with it to produce a dict per geography.
    header, *rows = payload
    return [dict(zip(header, row)) for row in rows]
mcp_functions/census_api_docs.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from markdownify import markdownify
3
+
4
+
5
def import_decennial_2020_datasets_homepage() -> str:
    """
    Fetches the homepage for the U.S. Census Bureau 2020 decennial census API housed at https://api.census.gov/data/2020/dec/dp.html.
    Contains descriptions of available datasets.
    Also includes links to additional helpful documentation such as available geographies and example API calls.

    Args:

    Returns:
        str: The homepage in markdown format

    Raises:
        requests.HTTPError: If the documentation page cannot be fetched.
    """

    # Timeout keeps an unresponsive server from hanging the app;
    # raise_for_status stops an HTTP error page being converted to markdown.
    response = requests.get("https://api.census.gov/data/2020/dec/dp.html", timeout=30)
    response.raise_for_status()

    return markdownify(response.text.strip())
20
+
21
+
22
def import_decennial_2020_demographic_profile_geographies() -> str:
    """
    Fetches information on available geographies for the U.S. Census Bureau 2020 decennial census demographic profile API housed at https://api.census.gov/data/2020/dec/dp/geography.html.
    Includes:
    * Geography Levels
    * Geography Hierarchy.

    Args:

    Returns:
        str: The information in markdown format

    Raises:
        requests.HTTPError: If the documentation page cannot be fetched.
    """

    # Timeout keeps an unresponsive server from hanging the app;
    # raise_for_status stops an HTTP error page being converted to markdown.
    response = requests.get(
        "https://api.census.gov/data/2020/dec/dp/geography.html", timeout=30
    )
    response.raise_for_status()

    return markdownify(response.text.strip())
38
+
39
+
40
def import_decennial_2020_demographic_profile_variables() -> str:
    """
    Fetches information on available variables for the U.S. Census Bureau 2020 decennial census demographic profile API housed at https://api.census.gov/data/2020/dec/dp/variables.html.
    * "Name" -- used to access variable during API calls
    * "Label" -- description of variable

    Args:

    Returns:
        str: The information in markdown format

    Raises:
        requests.HTTPError: If the documentation page cannot be fetched.
    """

    # Timeout keeps an unresponsive server from hanging the app;
    # raise_for_status stops an HTTP error page being converted to markdown.
    response = requests.get(
        "https://api.census.gov/data/2020/dec/dp/variables.html", timeout=30
    )
    response.raise_for_status()

    return markdownify(response.text.strip())
mcp_functions/census_utils.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from langchain_core.documents import Document
4
+ from vector_databases.census_dhc_dp_techdoc import census_dhc_dp_techdoc
5
+
6
+ from mcp_functions.utils import find_required_parent_geographies
7
+
8
+
9
def required_geograpy_hierarchy_parents(geography_hierarchy: str) -> List[str | None]:
    """
    MCP-facing wrapper around find_required_parent_geographies.

    Given the intent to look up a geography hierarchy within the U.S. Census
    Bureau 2020 decennial census demographic profile API, return the parent
    geographies that must be included with the request.

    Args:
        geography_hierarchy (str): The geographic level to query (e.g. 'region', 'state', 'county', 'principal city (or part)', etc.).
    Returns:
        List[str]: List of strings representing the required parent geographies.
    """
    # NOTE(review): the name misspells "geography"; kept as-is because
    # app.py imports it under exactly this name.
    parents = find_required_parent_geographies(geography_hierarchy)
    return parents
21
+
22
+
23
def decennial_2020_dhc_semantic_search(
    query: str,
    k: int = 4,
) -> List[Document]:
    """
    Perform a semantic search on the 2020 Census Demographic and Housing Characteristics File (DHC) housed at https://www2.census.gov/programs-surveys/decennial/2020/technical-documentation/complete-tech-docs/demographic-and-housing-characteristics-file-and-demographic-profile/2020census-demographic-and-housing-characteristics-file-and-demographic-profile-techdoc.pdf

    Args:
        query (str): The semantic query to perform.
        k (int): Number of most-similar documents to return. Defaults to 4,
            matching the previous hard-coded behavior.
    Returns:
        (List[Document]): The semantically related documents
    """
    # Delegates to the in-memory vector store built at import time in
    # vector_databases.census_dhc_dp_techdoc.
    docs = census_dhc_dp_techdoc.vector_store.similarity_search(query, k=k)

    return docs
mcp_functions/utils.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import deque
2
+
3
# Nested map of geography hierarchies for the 2020 decennial census
# demographic profile dataset. A child geography nests under its parent's
# key; a node's optional "required_parent_hierarchies" entry lists the
# parent geographies that must accompany it in an API "in" clause.
# Consumed by find_required_parent_geographies below.
geography_hierarchy_key = {
    "us": {},
    "region": {},
    "division": {},
    "state": {
        "county": {
            "county subdivision": {
                "required_parent_hierarchies": ["state"],
                "subminor civil division": {
                    "required_parent_hierarchies": [
                        "state",
                        "county",
                        "county subdivision",
                    ],
                },
            },
            "tract": {
                "required_parent_hierarchies": ["state"],
            },
        },
        "place": {},
        "consolidated city": {},
        "alaska native regional corporation": {},
        "american indian area/alaska native area/hawaiian home land (or part)": {
            "required_parent_hierarchies": ["state"],
            "tribal subdivision/remainder (or part)": {
                "required_parent_hierarchies": [
                    "state",
                    "american indian area/alaska native area/hawaiian home land (or part)",
                ]
            },
        },
        "metropolitan statistical area/micropolitan statistical area (or part)": {
            "required_parent_hierarchies": ["state"],
            "principal city (or part)": {
                "required_parent_hierarchies": [
                    "state",
                    "metropolitan statistical area/micropolitan statistical area (or part)",
                ]
            },
            "metropolitan division (or part)": {
                "required_parent_hierarchies": [
                    "state",
                    "metropolitan statistical area/micropolitan statistical area (or part)",
                ]
            },
        },
        "combined statistical area (or part)": {
            "required_parent_hierarchies": ["state"]
        },
        "combined new england city and town area (or part)": {
            "required_parent_hierarchies": ["state"],
        },
        "new england city and town area (or part)": {
            "required_parent_hierarchies": ["state"],
            "principal city": {
                "required_parent_hierarchies": [
                    "state",
                    "new england city and town area (or part)",
                ]
            },
            "necta division (or part)": {
                "required_parent_hierarchies": [
                    "state",
                    "new england city and town area (or part)",
                ]
            },
        },
        "congressional district": {},
        "state legislative district (upper chamber)": {
            "required_parent_hierarchies": ["state"]
        },
        "state legislative district (lower chamber)": {
            "required_parent_hierarchies": ["state"]
        },
        "zip code tabulation area (or part)": {
            "required_parent_hierarchies": ["state"]
        },
        "school district (elementary)": {},
        "school district (secondary)": {},
        "school district (unified)": {},
    },
    "american indian area/alaska native area/hawaiian home land": {
        "tribal subdivision/remainder": {},
        "tribal census tract": {
            "required_parent_hierarchies": [
                "american indian area/alaska native area/hawaiian home land"
            ]
        },
    },
    "metropolitan statistical area/micropolitan statistical area": {
        "state (or part)": {
            "principal city (or part)": {
                "required_parent_hierarchies": [
                    "metropolitan statistical area/micropolitan statistical area",
                    "state (or part)",
                ]
            }
        },
        "metropolitan division": {
            "required_parent_hierarchies": [
                "metropolitan statistical area/micropolitan statistical area"
            ]
        },
    },
    "combined statistical area": {},
    "combined new england city and town area": {},
    "new england city and town area": {
        "state (or part)": {
            "principal city": {
                "required_parent_hierarchies": [
                    "new england city and town area",
                    "state (or part)",
                ]
            }
        },
        "necta division": {
            "required_parent_hierarchies": ["new england city and town area"]
        },
    },
    "zip code tabulation area": {},
}
125
+
126
+
127
def find_required_parent_geographies(target_key: str) -> list[str]:
    """
    Breadth-first search geography_hierarchy_key for *target_key* and return
    its required parent geographies.

    Args:
        target_key (str): The geography hierarchy to look up (e.g. 'county').

    Returns:
        list[str]: The geography hierarchies that must accompany *target_key*
            in an API request; empty when the key needs no parents or is not
            found at all.
    """
    queue = deque([geography_hierarchy_key])

    while queue:
        current = queue.popleft()

        for key, value in current.items():
            if key == target_key:
                # Found the target; report its parents if any are declared.
                if isinstance(value, dict):
                    return value.get("required_parent_hierarchies", [])
                return []
            if isinstance(value, dict):
                queue.append(value)

    # Unknown geography hierarchy.
    return []
148
+
149
+
150
def build_fips_lookup(data: list[list[str]]) -> dict[str, dict[str, str]]:
    """
    Build a name -> FIPS-codes lookup table from a raw Census API response.

    The first response row is the header; each remaining row maps its first
    column (the geography NAME) to a dict of all other columns. Those columns
    carry the FIPS code for the requested geography and each of its parent
    geographies.

    Args:
        data (list[list[str]]): Raw rows from the Census API, header row first.

    Returns:
        dict[str, dict[str, str]]: {NAME: {column: value, ...}} for every
            data row; empty when *data* is empty.
    """
    # Guard: an empty response would otherwise blow up on unpacking.
    if not data:
        return {}

    header, *rows = data

    # Build the lookup dictionary, skipping column 0 (the NAME key itself)
    return {
        row[0]: {col: row[idx] for idx, col in enumerate(header) if idx != 0}
        for row in rows
    }
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio[mcp]
2
+ langchain-community
3
+ langchain-huggingface
4
+ markdownify
5
+ pypdf
vector_databases/__init__.py ADDED
File without changes
vector_databases/census_dhc_dp_techdoc/2020census-demographic-and-housing-characteristics-file-and-demographic-profile-techdoc.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed0e521a1c59a6e4653cd1e0192fe85c537f657aaca33525ece577efce412864
3
+ size 1806867
vector_databases/census_dhc_dp_techdoc/__init__.py ADDED
File without changes
vector_databases/census_dhc_dp_techdoc/census_dhc_dp_techdoc.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from langchain_core.vectorstores import InMemoryVectorStore
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+
5
+
6
def load_pages(
    pdf_path: str = "vector_databases/census_dhc_dp_techdoc/2020census-demographic-and-housing-characteristics-file-and-demographic-profile-techdoc.pdf",
):
    """
    Load every page of a PDF as a list of langchain Documents.

    Args:
        pdf_path (str): Path to the PDF to load. Defaults to the bundled
            2020 Census DHC/DP technical documentation (LFS-tracked).

    Returns:
        list[Document]: One Document per PDF page.
    """
    # PyPDFLoader.load already returns one Document per page; no need to
    # copy it element-by-element.
    return list(PyPDFLoader(pdf_path).load())
15
+
16
+
17
# Import-time side effect: the PDF is parsed and embedded as soon as this
# module is imported, so `vector_store` is ready for callers such as
# mcp_functions.census_utils.
pages = load_pages()


print("Vectorizing census documentation... This may take a few minutes")
# In-memory store: embeddings are recomputed on every process start and are
# never persisted to disk.
vector_store = InMemoryVectorStore.from_documents(
    pages, HuggingFaceEmbeddings(model_name="thenlper/gte-small")
)