{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"id": "ace57031",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 0 0 ... 0 0 0]\n",
" [0 0 0 ... 0 0 0]\n",
" [0 0 0 ... 0 0 0]\n",
" ...\n",
" [0 0 0 ... 0 0 0]\n",
" [0 0 0 ... 0 0 0]\n",
" [0 0 0 ... 0 0 0]]\n"
]
},
{
"ename": "ValueError",
"evalue": "empty vocabulary; perhaps the documents only contain stop words",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_10290/970796795.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mvectorizer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTfidfVectorizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvectorizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquestions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, raw_documents, y)\u001b[0m\n\u001b[1;32m 2077\u001b[0m \u001b[0msublinear_tf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msublinear_tf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2078\u001b[0m )\n\u001b[0;32m-> 2079\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_documents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2080\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tfidf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2081\u001b[0m \u001b[0;31m# X is already a transformed view of raw_documents so\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, raw_documents, y)\u001b[0m\n\u001b[1;32m 1336\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1337\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1338\u001b[0;31m \u001b[0mvocabulary\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_count_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_documents\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfixed_vocabulary_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1339\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1340\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36m_count_vocab\u001b[0;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[1;32m 1226\u001b[0m \u001b[0mvocabulary\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvocabulary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1227\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mvocabulary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1228\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 1229\u001b[0m \u001b[0;34m\"empty vocabulary; perhaps the documents only contain stop words\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1230\u001b[0m )\n",
"\u001b[0;31mValueError\u001b[0m: empty vocabulary; perhaps the documents only contain stop words"
]
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score\n",
"import pickle\n",
"import nltk\n",
"import numpy as np\n",
"from nltk.stem.lancaster import LancasterStemmer\n",
"# Step 1: Collect and preprocess data\n",
"# Get all the questions from Questions column and responses from Questions column in the dataset data.csv\n",
"# questions = data[\"Questions\"].tolist()\n",
"# responses = data[\"Responses\"].tolist()\n",
"questions = []\n",
"responses = []\n",
"q_id = []\n",
"stemmer = LancasterStemmer()\n",
"# with open(\"data.csv\", \"r\") as f:\n",
"with open(\"data.pickle\", \"rb\") as f:\n",
" # read the file\n",
" words, labels, training, output = pickle.load(f)\n",
"\n",
" # get the questions and responses\n",
" # for line in f:\n",
" # array = line.split(\",\") \n",
" # try:\n",
" # question = array[1]\n",
" # response = array[2]\n",
" # question_id = array[0]\n",
" # questions.append(question)\n",
" # responses.append(response)\n",
" # q_id.append(question_id)\n",
" # except:\n",
" # pass\n",
"\n",
"print(labels)\n",
"vectorizer = TfidfVectorizer()\n",
"X = vectorizer.fit_transform(questions)\n",
"y = responses\n",
"\n",
"# Step 2: Split data into training and testing sets\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
"\n",
"# Step 3: Choose a machine learning algorithm\n",
"model = LogisticRegression()\n",
"\n",
"# Step 4: Train the model\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Step 5: Evaluate the model\n",
"y_pred = model.predict(X_test)\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"print(\"Accuracy:\", accuracy)\n",
"\n",
"def bag_of_words(s, words):\n",
" bag = [0 for _ in range(len(words))]\n",
"\n",
" s_words = nltk.word_tokenize(s)\n",
" s_words = [stemmer.stem(word.lower()) for word in s_words]\n",
"\n",
" for se in s_words:\n",
" for i, w in enumerate(words):\n",
" if w == se:\n",
" bag[i] = 1\n",
" \n",
" return np.array(bag)\n",
"\n",
"# Step 6: Use the model to make predictions\n",
"new_question = \"how are you\"\n",
"# new_question_vector = vectorizer.transform([new_question])\n",
"# prediction = model.predict(new_question_vector)\n",
"\n",
"predict = model.predict([bag_of_words(new_question.lower(), words)])\n",
"# print(\"Prediction:\", prediction)\n",
"print(\"Prediction:\", predict)\n",
"\n",
"\n",
"# Step 7: Save the model\n",
"# import pickle\n",
"# pickle.dump(model, open(\"model.pkl\", \"wb\"))\n",
"# pickle.dump(vectorizer, open(\"vectorizer.pkl\", \"wb\"))\n",
"\n"
]
}
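,
{
"cell_type": "markdown",
"id": "c3f7a1d0",
"metadata": {},
"source": [
"A minimal sketch of how the trained classifier could answer a question end to end. It is illustrative only: it assumes the one-hot columns of `output` follow the order of `labels`, and the `label_responses` dict is a hypothetical stand-in for replies that would really come from the Responses column of `data.csv`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b2d4e6f",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch: wrap prediction in a small answer() helper.\n",
"# label_responses is hypothetical; build it from data.csv in practice.\n",
"label_responses = {label: f\"(canned reply for '{label}')\" for label in labels}\n",
"\n",
"def answer(question):\n",
"    vec = bag_of_words(question, words).reshape(1, -1)\n",
"    idx = model.predict(vec)[0]\n",
"    return label_responses[labels[idx]]\n",
"\n",
"print(answer(\"how are you\"))\n"
]
}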
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.7"
},
"vscode": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}