cboettig commited on
Commit
358dcca
·
1 Parent(s): c29aad8

:see_no_evil:

Browse files
Files changed (5) hide show
  1. .gitignore +3 -0
  2. Dockerfile +21 -0
  3. README.md +20 -1
  4. requirements.txt +14 -0
  5. src/app.py +124 -0
.gitignore CHANGED
@@ -172,3 +172,6 @@ cython_debug/
172
 
173
  # PyPI configuration file
174
  .pypirc
 
 
 
 
172
 
173
  # PyPI configuration file
174
  .pypirc
175
+
176
+ *.pdf
177
+
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get install -y \
6
+ build-essential \
7
+ curl \
8
+ software-properties-common \
9
+ git \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ COPY requirements.txt ./
13
+ COPY src/ ./src/
14
+
15
+ RUN pip3 install -r requirements.txt
16
+
17
+ EXPOSE 8501
18
+
19
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
20
+
21
+ ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md CHANGED
@@ -1 +1,20 @@
1
- # hwc-llm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Hwc Llm
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: docker
7
+ app_port: 8501
8
+ tags:
9
+ - streamlit
10
+ pinned: false
11
+ short_description: Human-Wildlife Conflict LLM
12
+ license: bsd
13
+ ---
14
+
15
+ # Welcome to Streamlit!
16
+
17
+ Edit `/src/app.py` to customize this app to your heart's desire. :heart:
18
+
19
+ If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
20
+ forums](https://discuss.streamlit.io).
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ bs4
3
+ langchain
4
+ langchain-chroma
5
+ langchain-community
6
+ langchain-core
7
+ langchain-openai
8
+ langchain-text-splitters
9
+ pypdf
10
+ requests
src/app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+
4
+ ## dockerized streamlit app wants to read from os.getenv(), otherwise use st.secrets
5
+ import os
6
+ api_key = os.getenv("LITELLM_KEY")
7
+ if api_key is None:
8
+ api_key = st.secrets["LITELLM_KEY"]
9
+ cirrus_key = os.getenv("CIRRUS_KEY")
10
+ if cirrus_key is None:
11
+ cirrus_key = st.secrets["CIRRUS_KEY"]
12
+
13
+ st.title("HWC LLM Testing")
14
+
15
+
16
+ '''
17
+ (Demo will take a while to load first while processing all data! Will be pre-processed in future...)
18
+ '''
19
+
20
+ # +
21
+ import bs4
22
+ from langchain import hub
23
+ from langchain_chroma import Chroma
24
+ from langchain_community.document_loaders import WebBaseLoader
25
+ from langchain_core.output_parsers import StrOutputParser
26
+ from langchain_core.runnables import RunnablePassthrough
27
+ from langchain_openai import OpenAIEmbeddings
28
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
29
+
30
+ import os
31
+ import requests
32
+ import zipfile
33
+
34
+ def download_and_unzip(url, output_dir):
35
+ if not os.path.exists(output_dir):
36
+ os.makedirs(output_dir)
37
+ response = requests.get(url)
38
+ zip_file_path = os.path.basename(url)
39
+ with open(zip_file_path, 'wb') as f:
40
+ f.write(response.content)
41
+ with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
42
+ zip_ref.extractall(output_dir)
43
+ os.remove(zip_file_path)
44
+
45
+ url = "https://minio.carlboettiger.info/public-data/hwc.zip"
46
+ output_dir = "hwc"
47
+ download_and_unzip(url, "hwc")
48
+
49
+ import pathlib
50
+ @st.cache_data
51
+ def pdf_loader(path):
52
+ all_documents = []
53
+ docs_dir = pathlib.Path(path)
54
+ for file in docs_dir.iterdir():
55
+ loader = PyPDFLoader(file)
56
+ documents = loader.load()
57
+ all_documents.extend(documents)
58
+ return all_documents
59
+
60
+ docs = pdf_loader('hwc/')
61
+
62
+
63
+
64
+ # Set up the language model
65
+ from langchain_openai import ChatOpenAI
66
+ llm = ChatOpenAI(model = "llama3", api_key = api_key, base_url = "https://llm.nrp-nautilus.io", temperature=0)
67
+ ## Cirrus instead:
68
+ embedding = OpenAIEmbeddings(
69
+ model = "cirrus",
70
+ api_key = cirrus_key,
71
+ base_url = "https://llm.cirrus.carlboettiger.info/v1",
72
+ )
73
+
74
+
75
+
76
+ # Build a retrival agent
77
+ from langchain_core.vectorstores import InMemoryVectorStore
78
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
79
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
80
+ splits = text_splitter.split_documents(docs)
81
+ vectorstore = InMemoryVectorStore.from_documents(documents=splits, embedding=embedding)
82
+ retriever = vectorstore.as_retriever()
83
+
84
+ from langchain.chains import create_retrieval_chain
85
+ from langchain.chains.combine_documents import create_stuff_documents_chain
86
+ from langchain_core.prompts import ChatPromptTemplate
87
+ system_prompt = (
88
+ "You are an assistant for question-answering tasks. "
89
+ "Use the following pieces of retrieved context to answer "
90
+ "the question. If you don't know the answer, say that you "
91
+ "don't know. Use three sentences maximum and keep the "
92
+ "answer concise."
93
+ "\n\n"
94
+ "{context}"
95
+ )
96
+ prompt = ChatPromptTemplate.from_messages(
97
+ [
98
+ ("system", system_prompt),
99
+ ("human", "{input}"),
100
+ ]
101
+ )
102
+ question_answer_chain = create_stuff_documents_chain(llm, prompt)
103
+ rag_chain = create_retrieval_chain(retriever, question_answer_chain)
104
+
105
+
106
+ # Place agent inside a streamlit application:
107
+
108
+ if prompt := st.chat_input("What is the goal of CA 30x30?"):
109
+ with st.chat_message("user"):
110
+ st.markdown(prompt)
111
+
112
+ with st.chat_message("assistant"):
113
+ results = rag_chain.invoke({"input": prompt})
114
+ st.write(results['answer'])
115
+
116
+ with st.expander("See context matched"):
117
+ st.write(results['context'][0].page_content)
118
+ st.write(results['context'][0].metadata)
119
+
120
+
121
+ # adapt for memory / multi-question interaction with:
122
+ # https://python.langchain.com/docs/tutorials/qa_chat_history/
123
+
124
+ # Also see structured outputs.