cboettig committed on
Commit
1f67ded
·
1 Parent(s): 9bbf13a
Files changed (4) hide show
  1. README.md +1 -4
  2. requirements.txt +0 -5
  3. src/app.py +2 -0
  4. tutorial.ipynb +120 -208
README.md CHANGED
@@ -12,9 +12,6 @@ short_description: Human-Wildlife Conflict LLM
12
  license: bsd
13
  ---
14
 
15
- # Welcome to Streamlit!
16
 
17
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
18
 
19
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
- forums](https://discuss.streamlit.io).
 
12
  license: bsd
13
  ---
14
 
15
+ View app: <https://huggingface.co/spaces/boettiger-lab/hwc-llm>
16
 
 
17
 
 
 
requirements.txt CHANGED
@@ -1,11 +1,6 @@
1
  streamlit
2
- langchain-chroma
3
- bs4
4
- langchain
5
- langchain-chroma
6
  langchain-community
7
  langchain-core
8
- langchain-core
9
  langchain_openai
10
  langchain-text-splitters
11
  requests
 
1
  streamlit
 
 
 
 
2
  langchain-community
3
  langchain-core
 
4
  langchain_openai
5
  langchain-text-splitters
6
  requests
src/app.py CHANGED
@@ -115,3 +115,5 @@ if prompt := st.chat_input("What are the most cost-effective prevention methods
115
  # https://python.langchain.com/docs/tutorials/qa_chat_history/
116
 
117
  # Also see structured outputs.
 
 
 
115
  # https://python.langchain.com/docs/tutorials/qa_chat_history/
116
 
117
  # Also see structured outputs.
118
+
119
+
tutorial.ipynb CHANGED
@@ -3,19 +3,108 @@
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "id": "d0bb4874-7f7b-40a9-88ea-922aaed0f3a3",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": [
10
- "\n",
11
- "# +\n",
12
- "from langchain_chroma import Chroma\n",
13
- "from langchain_core.output_parsers import StrOutputParser\n",
14
- "from langchain_core.runnables import RunnablePassthrough\n",
15
- "from langchain_openai import OpenAIEmbeddings\n",
16
- "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
17
- "from langchain_community.document_loaders import PyPDFLoader\n",
18
- "\n",
19
  "## dockerized streamlit app wants to read from os.getenv(), otherwise use st.secrets\n",
20
  "import streamlit as st\n",
21
  "import os\n",
@@ -29,7 +118,7 @@
29
  },
30
  {
31
  "cell_type": "code",
32
- "execution_count": 34,
33
  "id": "95ed10f3-5339-40cd-bf16-b0854f8b4b91",
34
  "metadata": {},
35
  "outputs": [],
@@ -39,47 +128,26 @@
39
  "import zipfile\n",
40
  "\n",
41
  "def download_and_unzip(url, output_dir):\n",
42
- " \"\"\"\n",
43
- " Downloads a ZIP file from a URL and unzips it to a specified directory.\n",
44
- " \n",
45
- " Args:\n",
46
- " url (str): The URL of the ZIP file.\n",
47
- " output_dir (str): The directory where the ZIP file will be unzipped.\n",
48
- " \"\"\"\n",
49
- " # Download the ZIP file\n",
50
  " response = requests.get(url)\n",
51
  " zip_file_path = os.path.basename(url)\n",
52
- "\n",
53
- " # Save the ZIP file to the current directory\n",
54
  " with open(zip_file_path, 'wb') as f:\n",
55
  " f.write(response.content)\n",
56
- "\n",
57
- " # Unzip the ZIP file\n",
58
  " with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n",
59
  " zip_ref.extractall(output_dir)\n",
60
- "\n",
61
- " # Remove the ZIP file\n",
62
  " os.remove(zip_file_path)\n",
63
  "\n",
64
- "# Example usage:\n",
65
- "url = \"https://minio.carlboettiger.info/public-data/hwc.zip\"\n",
66
- "output_dir = \"hwc\"\n",
67
- "\n",
68
- "# Create the output directory if it doesn't exist\n",
69
- "if not os.path.exists(output_dir):\n",
70
- " os.makedirs(output_dir)\n",
71
- "\n",
72
- "download_and_unzip(url, output_dir)"
73
  ]
74
  },
75
  {
76
  "cell_type": "code",
77
- "execution_count": 13,
78
  "id": "2fbca6dc-a90b-4dd4-8225-8baac5c6622d",
79
  "metadata": {},
80
  "outputs": [],
81
  "source": [
82
  "import pathlib\n",
 
83
  "\n",
84
  "def pdf_loader(path):\n",
85
  " all_documents = []\n",
@@ -90,12 +158,12 @@
90
  " all_documents.extend(documents)\n",
91
  " return all_documents\n",
92
  "\n",
93
- "docs = pdf_loader('/home/rstudio/data/hwc/')\n"
94
  ]
95
  },
96
  {
97
  "cell_type": "code",
98
- "execution_count": 14,
99
  "id": "c6e99791-8f34-4722-9708-665e409c26bd",
100
  "metadata": {},
101
  "outputs": [],
@@ -115,31 +183,23 @@
115
  },
116
  {
117
  "cell_type": "code",
118
- "execution_count": 23,
119
  "id": "0a8a004d-bb80-42bf-b7e6-15a54e2dd804",
120
  "metadata": {},
121
  "outputs": [],
122
  "source": [
123
  "## Cirrus instead:\n",
 
124
  "embedding = OpenAIEmbeddings(\n",
125
  " model = \"cirrus\",\n",
126
  " api_key = cirrus_key, \n",
127
  " base_url = \"https://llm.cirrus.carlboettiger.info/v1\",\n",
128
- ")\n",
129
- "\n",
130
- "\n",
131
- "\n",
132
- "text = \"A test\"\n",
133
- "\n",
134
- "vectorstore = InMemoryVectorStore.from_texts(\n",
135
- " [text],\n",
136
- " embedding=embedding,\n",
137
- ")\n"
138
  ]
139
  },
140
  {
141
  "cell_type": "code",
142
- "execution_count": 24,
143
  "id": "fd8bcc13-d06d-43dd-9e06-4f29da803133",
144
  "metadata": {},
145
  "outputs": [],
@@ -156,7 +216,7 @@
156
  },
157
  {
158
  "cell_type": "code",
159
- "execution_count": 25,
160
  "id": "7f388394-5da2-4db8-8e48-e10436c8532d",
161
  "metadata": {},
162
  "outputs": [],
@@ -168,7 +228,7 @@
168
  },
169
  {
170
  "cell_type": "code",
171
- "execution_count": 26,
172
  "id": "2bf50abf-5ccd-4de5-9fc4-c9043a66a108",
173
  "metadata": {},
174
  "outputs": [],
@@ -199,7 +259,7 @@
199
  },
200
  {
201
  "cell_type": "code",
202
- "execution_count": 27,
203
  "id": "e15c64e7-0916-4042-8274-870e4fdb1af7",
204
  "metadata": {},
205
  "outputs": [
@@ -227,7 +287,7 @@
227
  },
228
  {
229
  "cell_type": "code",
230
- "execution_count": 28,
231
  "id": "35613607-2c36-4761-a8ea-8c0889530f34",
232
  "metadata": {},
233
  "outputs": [
@@ -256,7 +316,7 @@
256
  },
257
  {
258
  "cell_type": "code",
259
- "execution_count": 29,
260
  "id": "3dfc39f6-86e9-47c3-ab67-08f90ebbb823",
261
  "metadata": {},
262
  "outputs": [
@@ -284,7 +344,7 @@
284
  },
285
  {
286
  "cell_type": "code",
287
- "execution_count": 30,
288
  "id": "56091874-0e41-4b35-be4f-08d8ec6faf56",
289
  "metadata": {},
290
  "outputs": [
@@ -310,7 +370,7 @@
310
  },
311
  {
312
  "cell_type": "code",
313
- "execution_count": 31,
314
  "id": "918dc691-6c66-46b2-8930-01dbeb6f712b",
315
  "metadata": {},
316
  "outputs": [
@@ -336,7 +396,7 @@
336
  },
337
  {
338
  "cell_type": "code",
339
- "execution_count": 32,
340
  "id": "07b9578c-9a89-4874-a34d-30a060ed3407",
341
  "metadata": {},
342
  "outputs": [
@@ -362,7 +422,7 @@
362
  },
363
  {
364
  "cell_type": "code",
365
- "execution_count": 33,
366
  "id": "ba272b88-1622-4d06-9361-7f1e2ca89e73",
367
  "metadata": {},
368
  "outputs": [
@@ -398,7 +458,7 @@
398
  },
399
  {
400
  "cell_type": "code",
401
- "execution_count": 17,
402
  "id": "d4bf2492-6852-43a7-8527-06ee4e9848c0",
403
  "metadata": {},
404
  "outputs": [
@@ -414,155 +474,7 @@
414
  ]
415
  }
416
  ],
417
- "source": [
418
- "import os\n",
419
- "from langchain_community.vectorstores import FAISS\n",
420
- "from langchain_community.vectorstores import Chroma\n",
421
- "from langchain_community.vectorstores import Qdrant\n",
422
- "from qdrant_client import QdrantClient\n",
423
- "from qdrant_client.models import Distance, VectorParams\n",
424
- "import gc\n",
425
- "import torch\n",
426
- "\n",
427
- "# Option 1: FAISS (Facebook AI Similarity Search) - Most memory efficient\n",
428
- "def create_faiss_vectorstore(splits, embedding, persist_directory=\"./faiss_db\", batch_size=100):\n",
429
- " \"\"\"\n",
430
- " Create FAISS vector store with batched processing to minimize GPU RAM usage\n",
431
- " \"\"\"\n",
432
- " os.makedirs(persist_directory, exist_ok=True)\n",
433
- " \n",
434
- " # Process documents in batches to avoid GPU memory overflow\n",
435
- " vectorstore = None\n",
436
- " \n",
437
- " for i in range(0, len(splits), batch_size):\n",
438
- " batch = splits[i:i + batch_size]\n",
439
- " print(f\"Processing batch {i//batch_size + 1}/{(len(splits) + batch_size - 1)//batch_size}\")\n",
440
- " \n",
441
- " if vectorstore is None:\n",
442
- " # Create initial vectorstore with first batch\n",
443
- " vectorstore = FAISS.from_documents(\n",
444
- " documents=batch,\n",
445
- " embedding=embedding\n",
446
- " )\n",
447
- " else:\n",
448
- " # Add subsequent batches to existing vectorstore\n",
449
- " batch_vectorstore = FAISS.from_documents(\n",
450
- " documents=batch,\n",
451
- " embedding=embedding\n",
452
- " )\n",
453
- " vectorstore.merge_from(batch_vectorstore)\n",
454
- " \n",
455
- " # Clean up temporary vectorstore\n",
456
- " del batch_vectorstore\n",
457
- " \n",
458
- " # Force garbage collection and clear GPU cache if using CUDA\n",
459
- " gc.collect()\n",
460
- " if torch.cuda.is_available():\n",
461
- " torch.cuda.empty_cache()\n",
462
- " \n",
463
- " # Save to disk\n",
464
- " vectorstore.save_local(persist_directory)\n",
465
- " print(f\"Vector store saved to {persist_directory}\")\n",
466
- " \n",
467
- " return vectorstore\n",
468
- "\n",
469
- "def load_faiss_vectorstore(embedding, persist_directory=\"./faiss_db\"):\n",
470
- " \"\"\"Load existing FAISS vector store from disk\"\"\"\n",
471
- " return FAISS.load_local(\n",
472
- " persist_directory,\n",
473
- " embedding,\n",
474
- " allow_dangerous_deserialization=True # Only if you trust the source\n",
475
- " )\n",
476
- "\n",
477
- "# Option 2: Chroma - Persistent SQLite-based storage\n",
478
- "def create_chroma_vectorstore(splits, embedding, persist_directory=\"./chroma_db\", batch_size=100):\n",
479
- " \"\"\"\n",
480
- " Create Chroma vector store with batched processing\n",
481
- " \"\"\"\n",
482
- " # Initialize Chroma with persistence\n",
483
- " vectorstore = Chroma(\n",
484
- " persist_directory=persist_directory,\n",
485
- " embedding_function=embedding\n",
486
- " )\n",
487
- " \n",
488
- " # Add documents in batches\n",
489
- " for i in range(0, len(splits), batch_size):\n",
490
- " batch = splits[i:i + batch_size]\n",
491
- " print(f\"Processing batch {i//batch_size + 1}/{(len(splits) + batch_size - 1)//batch_size}\")\n",
492
- " \n",
493
- " vectorstore.add_documents(batch)\n",
494
- " \n",
495
- " # Force garbage collection and clear GPU cache\n",
496
- " gc.collect()\n",
497
- " if torch.cuda.is_available():\n",
498
- " torch.cuda.empty_cache()\n",
499
- " \n",
500
- " # Persist to disk\n",
501
- " vectorstore.persist()\n",
502
- " print(f\"Vector store persisted to {persist_directory}\")\n",
503
- " \n",
504
- " return vectorstore\n",
505
- "\n",
506
- "def load_chroma_vectorstore(embedding, persist_directory=\"./chroma_db\"):\n",
507
- " \"\"\"Load existing Chroma vector store from disk\"\"\"\n",
508
- " return Chroma(\n",
509
- " persist_directory=persist_directory,\n",
510
- " embedding_function=embedding\n",
511
- " )\n",
512
- "\n",
513
- "# Option 3: Qdrant - High-performance vector database\n",
514
- "def create_qdrant_vectorstore(splits, embedding, collection_name=\"documents\", \n",
515
- " path=\"./qdrant_db\", batch_size=100):\n",
516
- " \"\"\"\n",
517
- " Create Qdrant vector store with local file-based storage\n",
518
- " \"\"\"\n",
519
- " # Initialize local Qdrant client\n",
520
- " client = QdrantClient(path=path)\n",
521
- " \n",
522
- " # Get embedding dimension (embed a sample text)\n",
523
- " sample_embedding = embedding.embed_query(\"sample text\")\n",
524
- " embedding_dim = len(sample_embedding)\n",
525
- " \n",
526
- " # Create collection if it doesn't exist\n",
527
- " try:\n",
528
- " client.create_collection(\n",
529
- " collection_name=collection_name,\n",
530
- " vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE)\n",
531
- " )\n",
532
- " except Exception as e:\n",
533
- " print(f\"Collection might already exist: {e}\")\n",
534
- " \n",
535
- " # Create vectorstore\n",
536
- " vectorstore = Qdrant(\n",
537
- " client=client,\n",
538
- " collection_name=collection_name,\n",
539
- " embeddings=embedding\n",
540
- " )\n",
541
- " \n",
542
- " # Add documents in batches\n",
543
- " for i in range(0, len(splits), batch_size):\n",
544
- " batch = splits[i:i + batch_size]\n",
545
- " print(f\"Processing batch {i//batch_size + 1}/{(len(splits) + batch_size - 1)//batch_size}\")\n",
546
- " \n",
547
- " vectorstore.add_documents(batch)\n",
548
- " \n",
549
- " # Force garbage collection and clear GPU cache\n",
550
- " gc.collect()\n",
551
- " if torch.cuda.is_available():\n",
552
- " torch.cuda.empty_cache()\n",
553
- " \n",
554
- " print(f\"Vector store created in {path}\")\n",
555
- " return vectorstore\n",
556
- "\n",
557
- "def load_qdrant_vectorstore(embedding, collection_name=\"documents\", path=\"./qdrant_db\"):\n",
558
- " \"\"\"Load existing Qdrant vector store from disk\"\"\"\n",
559
- " client = QdrantClient(path=path)\n",
560
- " return Qdrant(\n",
561
- " client=client,\n",
562
- " collection_name=collection_name,\n",
563
- " embeddings=embedding\n",
564
- " )\n"
565
- ]
566
  },
567
  {
568
  "cell_type": "code",
@@ -622,7 +534,7 @@
622
  ],
623
  "metadata": {
624
  "kernelspec": {
625
- "display_name": "Python 3 (ipykernel)",
626
  "language": "python",
627
  "name": "python3"
628
  },
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": 1,
6
+ "id": "49d94364",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Requirement already satisfied: streamlit in /opt/conda/lib/python3.12/site-packages (1.45.1)\n",
14
+ "Requirement already satisfied: langchain-community in /opt/conda/lib/python3.12/site-packages (0.3.24)\n",
15
+ "Requirement already satisfied: langchain-openai in /opt/conda/lib/python3.12/site-packages (0.3.18)\n",
16
+ "Requirement already satisfied: langchain-text-splitters in /opt/conda/lib/python3.12/site-packages (0.3.8)\n",
17
+ "Requirement already satisfied: requests in /opt/conda/lib/python3.12/site-packages (2.32.3)\n",
18
+ "Requirement already satisfied: pathlib in /opt/conda/lib/python3.12/site-packages (1.0.1)\n",
19
+ "Requirement already satisfied: pypdf in /opt/conda/lib/python3.12/site-packages (5.5.0)\n",
20
+ "Requirement already satisfied: altair<6,>=4.0 in /opt/conda/lib/python3.12/site-packages (from streamlit) (5.5.0)\n",
21
+ "Requirement already satisfied: blinker<2,>=1.5.0 in /opt/conda/lib/python3.12/site-packages (from streamlit) (1.9.0)\n",
22
+ "Requirement already satisfied: cachetools<6,>=4.0 in /opt/conda/lib/python3.12/site-packages (from streamlit) (5.5.2)\n",
23
+ "Requirement already satisfied: click<9,>=7.0 in /opt/conda/lib/python3.12/site-packages (from streamlit) (8.2.0)\n",
24
+ "Requirement already satisfied: numpy<3,>=1.23 in /opt/conda/lib/python3.12/site-packages (from streamlit) (1.26.4)\n",
25
+ "Requirement already satisfied: packaging<25,>=20 in /opt/conda/lib/python3.12/site-packages (from streamlit) (24.2)\n",
26
+ "Requirement already satisfied: pandas<3,>=1.4.0 in /opt/conda/lib/python3.12/site-packages (from streamlit) (2.2.3)\n",
27
+ "Requirement already satisfied: pillow<12,>=7.1.0 in /opt/conda/lib/python3.12/site-packages (from streamlit) (11.2.1)\n",
28
+ "Requirement already satisfied: protobuf<7,>=3.20 in /opt/conda/lib/python3.12/site-packages (from streamlit) (5.29.3)\n",
29
+ "Requirement already satisfied: pyarrow>=7.0 in /opt/conda/lib/python3.12/site-packages (from streamlit) (19.0.1)\n",
30
+ "Requirement already satisfied: tenacity<10,>=8.1.0 in /opt/conda/lib/python3.12/site-packages (from streamlit) (9.1.2)\n",
31
+ "Requirement already satisfied: toml<2,>=0.10.1 in /opt/conda/lib/python3.12/site-packages (from streamlit) (0.10.2)\n",
32
+ "Requirement already satisfied: typing-extensions<5,>=4.4.0 in /opt/conda/lib/python3.12/site-packages (from streamlit) (4.13.2)\n",
33
+ "Requirement already satisfied: watchdog<7,>=2.1.5 in /opt/conda/lib/python3.12/site-packages (from streamlit) (6.0.0)\n",
34
+ "Requirement already satisfied: gitpython!=3.1.19,<4,>=3.0.7 in /opt/conda/lib/python3.12/site-packages (from streamlit) (3.1.44)\n",
35
+ "Requirement already satisfied: pydeck<1,>=0.8.0b4 in /opt/conda/lib/python3.12/site-packages (from streamlit) (0.9.1)\n",
36
+ "Requirement already satisfied: tornado<7,>=6.0.3 in /opt/conda/lib/python3.12/site-packages (from streamlit) (6.5)\n",
37
+ "Requirement already satisfied: charset_normalizer<4,>=2 in /opt/conda/lib/python3.12/site-packages (from requests) (3.4.2)\n",
38
+ "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.12/site-packages (from requests) (3.10)\n",
39
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.12/site-packages (from requests) (2.4.0)\n",
40
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.12/site-packages (from requests) (2025.4.26)\n",
41
+ "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.12/site-packages (from altair<6,>=4.0->streamlit) (3.1.6)\n",
42
+ "Requirement already satisfied: jsonschema>=3.0 in /opt/conda/lib/python3.12/site-packages (from altair<6,>=4.0->streamlit) (4.23.0)\n",
43
+ "Requirement already satisfied: narwhals>=1.14.2 in /opt/conda/lib/python3.12/site-packages (from altair<6,>=4.0->streamlit) (1.39.1)\n",
44
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in /opt/conda/lib/python3.12/site-packages (from gitpython!=3.1.19,<4,>=3.0.7->streamlit) (4.0.12)\n",
45
+ "Requirement already satisfied: smmap<6,>=3.0.1 in /opt/conda/lib/python3.12/site-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.19,<4,>=3.0.7->streamlit) (5.0.2)\n",
46
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.12/site-packages (from pandas<3,>=1.4.0->streamlit) (2.9.0.post0)\n",
47
+ "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.12/site-packages (from pandas<3,>=1.4.0->streamlit) (2025.2)\n",
48
+ "Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.12/site-packages (from pandas<3,>=1.4.0->streamlit) (2025.2)\n",
49
+ "Requirement already satisfied: langchain-core<1.0.0,>=0.3.59 in /opt/conda/lib/python3.12/site-packages (from langchain-community) (0.3.61)\n",
50
+ "Requirement already satisfied: langchain<1.0.0,>=0.3.25 in /opt/conda/lib/python3.12/site-packages (from langchain-community) (0.3.25)\n",
51
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /opt/conda/lib/python3.12/site-packages (from langchain-community) (2.0.41)\n",
52
+ "Requirement already satisfied: PyYAML>=5.3 in /opt/conda/lib/python3.12/site-packages (from langchain-community) (6.0.2)\n",
53
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /opt/conda/lib/python3.12/site-packages (from langchain-community) (3.11.18)\n",
54
+ "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /opt/conda/lib/python3.12/site-packages (from langchain-community) (0.6.7)\n",
55
+ "Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in /opt/conda/lib/python3.12/site-packages (from langchain-community) (2.9.1)\n",
56
+ "Requirement already satisfied: langsmith<0.4,>=0.1.125 in /opt/conda/lib/python3.12/site-packages (from langchain-community) (0.2.11)\n",
57
+ "Requirement already satisfied: httpx-sse<1.0.0,>=0.4.0 in /opt/conda/lib/python3.12/site-packages (from langchain-community) (0.4.0)\n",
58
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/conda/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (2.6.1)\n",
59
+ "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.3.2)\n",
60
+ "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (25.3.0)\n",
61
+ "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.6.0)\n",
62
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (6.4.4)\n",
63
+ "Requirement already satisfied: propcache>=0.2.0 in /opt/conda/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (0.3.1)\n",
64
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in /opt/conda/lib/python3.12/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain-community) (1.20.0)\n",
65
+ "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /opt/conda/lib/python3.12/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (3.26.1)\n",
66
+ "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /opt/conda/lib/python3.12/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain-community) (0.9.0)\n",
67
+ "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in /opt/conda/lib/python3.12/site-packages (from langchain<1.0.0,>=0.3.25->langchain-community) (2.11.4)\n",
68
+ "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /opt/conda/lib/python3.12/site-packages (from langchain-core<1.0.0,>=0.3.59->langchain-community) (1.33)\n",
69
+ "Requirement already satisfied: jsonpointer>=1.9 in /opt/conda/lib/python3.12/site-packages (from jsonpatch<2.0,>=1.33->langchain-core<1.0.0,>=0.3.59->langchain-community) (3.0.0)\n",
70
+ "Requirement already satisfied: httpx<1,>=0.23.0 in /opt/conda/lib/python3.12/site-packages (from langsmith<0.4,>=0.1.125->langchain-community) (0.28.1)\n",
71
+ "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in /opt/conda/lib/python3.12/site-packages (from langsmith<0.4,>=0.1.125->langchain-community) (3.10.18)\n",
72
+ "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in /opt/conda/lib/python3.12/site-packages (from langsmith<0.4,>=0.1.125->langchain-community) (1.0.0)\n",
73
+ "Requirement already satisfied: anyio in /opt/conda/lib/python3.12/site-packages (from httpx<1,>=0.23.0->langsmith<0.4,>=0.1.125->langchain-community) (4.9.0)\n",
74
+ "Requirement already satisfied: httpcore==1.* in /opt/conda/lib/python3.12/site-packages (from httpx<1,>=0.23.0->langsmith<0.4,>=0.1.125->langchain-community) (1.0.9)\n",
75
+ "Requirement already satisfied: h11>=0.16 in /opt/conda/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->langsmith<0.4,>=0.1.125->langchain-community) (0.16.0)\n",
76
+ "Requirement already satisfied: annotated-types>=0.6.0 in /opt/conda/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.7.4->langchain<1.0.0,>=0.3.25->langchain-community) (0.7.0)\n",
77
+ "Requirement already satisfied: pydantic-core==2.33.2 in /opt/conda/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.7.4->langchain<1.0.0,>=0.3.25->langchain-community) (2.33.2)\n",
78
+ "Requirement already satisfied: typing-inspection>=0.4.0 in /opt/conda/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.7.4->langchain<1.0.0,>=0.3.25->langchain-community) (0.4.0)\n",
79
+ "Requirement already satisfied: python-dotenv>=0.21.0 in /opt/conda/lib/python3.12/site-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain-community) (1.1.0)\n",
80
+ "Requirement already satisfied: greenlet>=1 in /opt/conda/lib/python3.12/site-packages (from SQLAlchemy<3,>=1.4->langchain-community) (3.2.2)\n",
81
+ "Requirement already satisfied: mypy_extensions>=0.3.0 in /opt/conda/lib/python3.12/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community) (1.1.0)\n",
82
+ "Requirement already satisfied: openai<2.0.0,>=1.68.2 in /opt/conda/lib/python3.12/site-packages (from langchain-openai) (1.82.0)\n",
83
+ "Requirement already satisfied: tiktoken<1,>=0.7 in /opt/conda/lib/python3.12/site-packages (from langchain-openai) (0.9.0)\n",
84
+ "Requirement already satisfied: distro<2,>=1.7.0 in /opt/conda/lib/python3.12/site-packages (from openai<2.0.0,>=1.68.2->langchain-openai) (1.9.0)\n",
85
+ "Requirement already satisfied: jiter<1,>=0.4.0 in /opt/conda/lib/python3.12/site-packages (from openai<2.0.0,>=1.68.2->langchain-openai) (0.10.0)\n",
86
+ "Requirement already satisfied: sniffio in /opt/conda/lib/python3.12/site-packages (from openai<2.0.0,>=1.68.2->langchain-openai) (1.3.1)\n",
87
+ "Requirement already satisfied: tqdm>4 in /opt/conda/lib/python3.12/site-packages (from openai<2.0.0,>=1.68.2->langchain-openai) (4.67.1)\n",
88
+ "Requirement already satisfied: regex>=2022.1.18 in /opt/conda/lib/python3.12/site-packages (from tiktoken<1,>=0.7->langchain-openai) (2024.11.6)\n",
89
+ "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.12/site-packages (from jinja2->altair<6,>=4.0->streamlit) (3.0.2)\n",
90
+ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.12/site-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (2025.4.1)\n",
91
+ "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.12/site-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.36.2)\n",
92
+ "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.12/site-packages (from jsonschema>=3.0->altair<6,>=4.0->streamlit) (0.25.0)\n",
93
+ "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas<3,>=1.4.0->streamlit) (1.17.0)\n"
94
+ ]
95
+ }
96
+ ],
97
+ "source": [
98
+ "!pip install streamlit langchain-community langchain-openai langchain-text-splitters requests pathlib pypdf"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 2,
104
  "id": "d0bb4874-7f7b-40a9-88ea-922aaed0f3a3",
105
  "metadata": {},
106
  "outputs": [],
107
  "source": [
 
 
 
 
 
 
 
 
 
108
  "## dockerized streamlit app wants to read from os.getenv(), otherwise use st.secrets\n",
109
  "import streamlit as st\n",
110
  "import os\n",
 
118
  },
119
  {
120
  "cell_type": "code",
121
+ "execution_count": 3,
122
  "id": "95ed10f3-5339-40cd-bf16-b0854f8b4b91",
123
  "metadata": {},
124
  "outputs": [],
 
128
  "import zipfile\n",
129
  "\n",
130
  "def download_and_unzip(url, output_dir):\n",
 
 
 
 
 
 
 
 
131
  " response = requests.get(url)\n",
132
  " zip_file_path = os.path.basename(url)\n",
 
 
133
  " with open(zip_file_path, 'wb') as f:\n",
134
  " f.write(response.content)\n",
 
 
135
  " with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:\n",
136
  " zip_ref.extractall(output_dir)\n",
 
 
137
  " os.remove(zip_file_path)\n",
138
  "\n",
139
+ "download_and_unzip(\"https://minio.carlboettiger.info/public-data/hwc.zip\", \"hwc\")"
 
 
 
 
 
 
 
 
140
  ]
141
  },
142
  {
143
  "cell_type": "code",
144
+ "execution_count": 5,
145
  "id": "2fbca6dc-a90b-4dd4-8225-8baac5c6622d",
146
  "metadata": {},
147
  "outputs": [],
148
  "source": [
149
  "import pathlib\n",
150
+ "from langchain_community.document_loaders import PyPDFLoader\n",
151
  "\n",
152
  "def pdf_loader(path):\n",
153
  " all_documents = []\n",
 
158
  " all_documents.extend(documents)\n",
159
  " return all_documents\n",
160
  "\n",
161
+ "docs = pdf_loader('hwc/')\n"
162
  ]
163
  },
164
  {
165
  "cell_type": "code",
166
+ "execution_count": 6,
167
  "id": "c6e99791-8f34-4722-9708-665e409c26bd",
168
  "metadata": {},
169
  "outputs": [],
 
183
  },
184
  {
185
  "cell_type": "code",
186
+ "execution_count": 7,
187
  "id": "0a8a004d-bb80-42bf-b7e6-15a54e2dd804",
188
  "metadata": {},
189
  "outputs": [],
190
  "source": [
191
  "## Cirrus instead:\n",
192
+ "from langchain_openai import OpenAIEmbeddings\n",
193
  "embedding = OpenAIEmbeddings(\n",
194
  " model = \"cirrus\",\n",
195
  " api_key = cirrus_key, \n",
196
  " base_url = \"https://llm.cirrus.carlboettiger.info/v1\",\n",
197
+ ")"
 
 
 
 
 
 
 
 
 
198
  ]
199
  },
200
  {
201
  "cell_type": "code",
202
+ "execution_count": 8,
203
  "id": "fd8bcc13-d06d-43dd-9e06-4f29da803133",
204
  "metadata": {},
205
  "outputs": [],
 
216
  },
217
  {
218
  "cell_type": "code",
219
+ "execution_count": null,
220
  "id": "7f388394-5da2-4db8-8e48-e10436c8532d",
221
  "metadata": {},
222
  "outputs": [],
 
228
  },
229
  {
230
  "cell_type": "code",
231
+ "execution_count": null,
232
  "id": "2bf50abf-5ccd-4de5-9fc4-c9043a66a108",
233
  "metadata": {},
234
  "outputs": [],
 
259
  },
260
  {
261
  "cell_type": "code",
262
+ "execution_count": null,
263
  "id": "e15c64e7-0916-4042-8274-870e4fdb1af7",
264
  "metadata": {},
265
  "outputs": [
 
287
  },
288
  {
289
  "cell_type": "code",
290
+ "execution_count": null,
291
  "id": "35613607-2c36-4761-a8ea-8c0889530f34",
292
  "metadata": {},
293
  "outputs": [
 
316
  },
317
  {
318
  "cell_type": "code",
319
+ "execution_count": null,
320
  "id": "3dfc39f6-86e9-47c3-ab67-08f90ebbb823",
321
  "metadata": {},
322
  "outputs": [
 
344
  },
345
  {
346
  "cell_type": "code",
347
+ "execution_count": null,
348
  "id": "56091874-0e41-4b35-be4f-08d8ec6faf56",
349
  "metadata": {},
350
  "outputs": [
 
370
  },
371
  {
372
  "cell_type": "code",
373
+ "execution_count": null,
374
  "id": "918dc691-6c66-46b2-8930-01dbeb6f712b",
375
  "metadata": {},
376
  "outputs": [
 
396
  },
397
  {
398
  "cell_type": "code",
399
+ "execution_count": null,
400
  "id": "07b9578c-9a89-4874-a34d-30a060ed3407",
401
  "metadata": {},
402
  "outputs": [
 
422
  },
423
  {
424
  "cell_type": "code",
425
+ "execution_count": null,
426
  "id": "ba272b88-1622-4d06-9361-7f1e2ca89e73",
427
  "metadata": {},
428
  "outputs": [
 
458
  },
459
  {
460
  "cell_type": "code",
461
+ "execution_count": null,
462
  "id": "d4bf2492-6852-43a7-8527-06ee4e9848c0",
463
  "metadata": {},
464
  "outputs": [
 
474
  ]
475
  }
476
  ],
477
+ "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
  },
479
  {
480
  "cell_type": "code",
 
534
  ],
535
  "metadata": {
536
  "kernelspec": {
537
+ "display_name": "base",
538
  "language": "python",
539
  "name": "python3"
540
  },