{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Read in documents using LangChain's loaders\n", "# Take everything in all the sub-folders of our knowledgebase\n", "\n", "import glob\n", "import os\n", "\n", "# imports for langchain, plotly and Chroma\n", "\n", "from langchain.document_loaders import DirectoryLoader, TextLoader\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.schema import Document\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "from langchain.chat_models import ChatOpenAI\n", "from langchain_chroma import Chroma\n", "import matplotlib.pyplot as plt\n", "from sklearn.manifold import TSNE\n", "import numpy as np\n", "import plotly.graph_objects as go\n", "from langchain.memory import ConversationBufferMemory\n", "from langchain.chains import ConversationalRetrievalChain\n", "from langchain.embeddings import HuggingFaceEmbeddings" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total number of chunks: 6800\n", "Document types found: {'academic_calendar', 'admissions', 'research', 'tuition', 'about', 'resources', 'contact', 'policies', 'academics', 'sports', 'scholarships', 'financial_aid', 'events', 'exchange', 'campus', 'student_support', 'news'}\n" ] } ], "source": [ "folders = glob.glob(\"usiu-knowledge-base/*\")\n", "\n", "def add_metadata(doc, doc_type):\n", " doc.metadata[\"doc_type\"] = doc_type\n", " return doc\n", "\n", "# With thanks to CG and Jon R, students on the course, for this fix needed for some users \n", "text_loader_kwargs = {'encoding': 'utf-8'}\n", "# If that doesn't work, some Windows users might need to uncomment the next line instead\n", "# text_loader_kwargs={'autodetect_encoding': True}\n", "\n", "documents = []\n", "for folder in folders:\n", " doc_type = os.path.basename(folder)\n", " loader = DirectoryLoader(folder, 
glob=\"**/*.md\", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)\n", " folder_docs = loader.load()\n", " documents.extend([add_metadata(doc, doc_type) for doc in folder_docs])\n", "\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", "chunks = text_splitter.split_documents(documents)\n", "\n", "print(f\"Total number of chunks: {len(chunks)}\")\n", "print(f\"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'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'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load environment variables in a file called .env\n", "\n", "from dotenv import load_dotenv\n", "\n", "\n", "load_dotenv()\n", "# os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')\n", "os.getenv('OPENAI_API_KEY')" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "6800\n", "[Document(metadata={'doc_type': 'research', 'source': 'usiu-knowledge-base/research/20250330031216_apply_now_admission_requirements.md'}, page_content='# Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/admission-requirements/'), Document(metadata={'doc_type': 'tuition', 'source': 'usiu-knowledge-base/tuition/20250330203921_apply_now_admission_requirements.md'}, page_content='# Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/admission-requirements/'), Document(metadata={'doc_type': 'sports', 'source': 'usiu-knowledge-base/sports/20250330090137_apply_now_admission_requirements.md'}, page_content='# Admission Requirements - USIU-Africa URL: 
https://www.usiu.ac.ke/apply-now/admission-requirements/'), Document(metadata={'doc_type': 'admissions', 'source': 'usiu-knowledge-base/admissions/20250330003916_apply_now_admission_requirements.md'}, page_content='# Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/admission-requirements/'), Document(metadata={'doc_type': 'tuition', 'source': 'usiu-knowledge-base/tuition/20250330205539_apply_now_doctoral_admission_requirements.md'}, page_content='# Doctoral Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/doctoral-admission-requirements/'), Document(metadata={'doc_type': 'student_support', 'source': 'usiu-knowledge-base/student_support/20250330045635_apply_now_doctoral_admission_requirements.md'}, page_content='# Doctoral Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/doctoral-admission-requirements/'), Document(metadata={'doc_type': 'admissions', 'source': 'usiu-knowledge-base/admissions/20250330004404_apply_now_doctoral_admission_requirements.md'}, page_content='# Doctoral Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/doctoral-admission-requirements/'), Document(metadata={'doc_type': 'events', 'source': 'usiu-knowledge-base/events/20250330133137_apply_now_home.md'}, page_content='# USIU-Africa URL: https://www.usiu.ac.ke/apply-now/home/'), Document(metadata={'doc_type': 'resources', 'source': 'usiu-knowledge-base/resources/20250330124605_apply_now_home.md'}, page_content='# USIU-Africa URL: https://www.usiu.ac.ke/apply-now/home/'), Document(metadata={'doc_type': 'scholarships', 'source': 'usiu-knowledge-base/scholarships/20250331003106_apply_now_home.md'}, page_content='# USIU-Africa URL: https://www.usiu.ac.ke/apply-now/home/')]\n", "Retrieved Documents: 10\n", "Retrieved Document Content:\n", "# Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/admission-requirements/\n", "--------------------------------------------------\n", 
"Retrieved Document Content:\n", "# Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/admission-requirements/\n", "--------------------------------------------------\n", "Retrieved Document Content:\n", "# Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/admission-requirements/\n", "--------------------------------------------------\n", "Retrieved Document Content:\n", "# Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/admission-requirements/\n", "--------------------------------------------------\n", "Retrieved Document Content:\n", "# Doctoral Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/doctoral-admission-requirements/\n", "--------------------------------------------------\n", "Retrieved Document Content:\n", "# Doctoral Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/doctoral-admission-requirements/\n", "--------------------------------------------------\n", "Retrieved Document Content:\n", "# Doctoral Admission Requirements - USIU-Africa URL: https://www.usiu.ac.ke/apply-now/doctoral-admission-requirements/\n", "--------------------------------------------------\n", "Retrieved Document Content:\n", "# USIU-Africa URL: https://www.usiu.ac.ke/apply-now/home/\n", "--------------------------------------------------\n", "Retrieved Document Content:\n", "# USIU-Africa URL: https://www.usiu.ac.ke/apply-now/home/\n", "--------------------------------------------------\n", "Retrieved Document Content:\n", "# USIU-Africa URL: https://www.usiu.ac.ke/apply-now/home/\n", "--------------------------------------------------\n" ] } ], "source": [ "# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk\n", "# Chroma is a popular open source Vector Database based on SQLLite\n", "\n", "# If you would rather use the free Vector Embeddings from HuggingFace sentence-transformers\n", "# Then replace 
embeddings = OpenAIEmbeddings()\n",
  "# with:\n",
  "# from langchain.embeddings import HuggingFaceEmbeddings\n",
  "# embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n",
  "\n",
  "embeddings = OpenAIEmbeddings()\n",
  "\n",
  "db_name = \"./vector_services/usiu_vector_db\"\n",
  "\n",
  "# Fail early if the persisted store is missing; otherwise Chroma is expected to start a new,\n",
  "# empty store at this path and every search below would quietly return nothing.\n",
  "if not os.path.isdir(db_name):\n",
  "    raise FileNotFoundError(f\"No persisted vector store at {db_name!r}; build it before running this cell\")\n",
  "\n",
  "# Open the persisted VectorStore (Chroma is langchain_chroma.Chroma, imported in the first cell)\n",
  "vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)\n",
  "\n",
  "# Example query\n",
  "query = \"How do I get admitted at USIU-Africa?\"\n",
  "\n",
  "# Perform a similarity search to find the most relevant documents\n",
  "docs = vectorstore.similarity_search(query, k=10)\n",
  "\n",
  "print(f\"Retrieved Documents: {len(docs)}\")\n",
  "# Print each hit's source file and content rather than dumping the raw Document list\n",
  "for doc in docs:\n",
  "    print(f\"Source: {doc.metadata.get('source')}\")\n",
  "    print(\"Retrieved Document Content:\")\n",
  "    print(doc.page_content)\n",
  "    print(\"-\" * 50)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "from configs.config import GPT4O_MODEL as MODEL\n",
  "\n",
  "\n",
  "# create a new Chat with OpenAI\n",
  "llm = ChatOpenAI(temperature=0.7, model_name=MODEL)\n",
  "\n",
  "# Alternative - if you'd like to use Ollama locally, uncomment this line instead\n",
  "# llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')\n",
  "\n",
  "# set up the conversation memory for the chat\n",
  "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n",
  "\n",
  "# the retriever is an abstraction over the VectorStore that will be used during RAG\n",
  "retriever = vectorstore.as_retriever()\n",
  "\n",
  "# putting it together: set up the conversation chain with the configured LLM (MODEL), the vector store and memory\n",
  "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {},
"outputs": [], "source": [ "result = conversation_chain.invoke({\"question\": query})\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# set up a new conversation memory for the chat\n", "memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)\n", "\n", "# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory\n", "conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Wrapping that in a function\n", "\n", "def chat(question, history):\n", " result = conversation_chain.invoke({\"question\": question})\n", " return result[\"answer\"]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/x4/dd2xz8_d4fjbb7kzdq5sp8_40000gn/T/ipykernel_56308/3678441278.py:129: UserWarning: You have not specified a value for the `type` parameter. Defaulting to the 'tuples' format for chatbot messages, but this is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style dictionaries with 'role' and 'content' keys.\n", " chatbot = gr.Chatbot(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loaded vector store with 0 documents from './usiu_vector_db'.\n", "* Running on local URL: http://127.0.0.1:7881\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "