100rabhsah committed on
Commit
3269b9d
·
1 Parent(s): b0bc9cf

app.py analyse function fix 3

Browse files
Files changed (2) hide show
  1. src/app.py +8 -0
  2. src/data/preprocessor.py +4 -2
src/app.py CHANGED
@@ -18,6 +18,14 @@ try:
18
  nltk.data.find('corpora/stopwords')
19
  except LookupError:
20
  nltk.download('stopwords')
 
 
 
 
 
 
 
 
21
 
22
  # Add project root to Python path
23
  project_root = Path(__file__).parent.parent
 
18
  nltk.data.find('corpora/stopwords')
19
  except LookupError:
20
  nltk.download('stopwords')
21
+ try:
22
+ nltk.data.find('tokenizers/punkt_tab')
23
+ except LookupError:
24
+ nltk.download('punkt_tab')
25
+ try:
26
+ nltk.data.find('corpora/wordnet')
27
+ except LookupError:
28
+ nltk.download('wordnet')
29
 
30
  # Add project root to Python path
31
  project_root = Path(__file__).parent.parent
src/data/preprocessor.py CHANGED
@@ -37,12 +37,14 @@ class TextPreprocessor:
37
 
38
  def lemmatize_text(self, text: str) -> str:
39
  """Lemmatize text."""
40
- tokens = word_tokenize(text)
 
41
  return ' '.join([self.lemmatizer.lemmatize(token) for token in tokens])
42
 
43
  def remove_stopwords(self, text: str) -> str:
44
  """Remove stopwords from text."""
45
- tokens = word_tokenize(text)
 
46
  return ' '.join([token for token in tokens if token.lower() not in self.stop_words])
47
 
48
  def correct_spelling(self, text: str) -> str:
 
37
 
38
  def lemmatize_text(self, text: str) -> str:
39
  """Lemmatize text."""
40
+ # Simple word tokenization using split
41
+ tokens = text.split()
42
  return ' '.join([self.lemmatizer.lemmatize(token) for token in tokens])
43
 
44
  def remove_stopwords(self, text: str) -> str:
45
  """Remove stopwords from text."""
46
+ # Simple word tokenization using split
47
+ tokens = text.split()
48
  return ' '.join([token for token in tokens if token.lower() not in self.stop_words])
49
 
50
  def correct_spelling(self, text: str) -> str: