Spaces:

adnaan05
/

TruthCheck

Running

App Files Files Community

TruthCheck / src /data /preprocessor.py

adnaan05

Initial commit for Hugging Face Space

469c254 4 months ago

raw

history blame

3.23 kB

	import re
	import emoji
	import nltk
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	from nltk.stem import WordNetLemmatizer
	from textblob import TextBlob
	from typing import List, Union
	import pandas as pd

	class TextPreprocessor:
	    """Text-cleaning pipeline built from small, individually callable steps.

	    Each ``remove_*`` / ``lemmatize_text`` / ``correct_spelling`` method takes
	    a string and returns a new string. ``preprocess_text`` chains them in a
	    fixed order, and ``preprocess_dataframe`` applies that chain to one
	    column of a pandas DataFrame.
	    """

	    # Matches "http://..." / "https://..." links, or bare "www...." links,
	    # up to the next whitespace. The "|" must be unescaped so it acts as
	    # alternation between the two link forms.
	    _URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
	    # Anything that is neither an ASCII letter nor whitespace.
	    _NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
	    # One or more consecutive whitespace characters.
	    _WHITESPACE_PATTERN = re.compile(r'\s+')

	    def __init__(self):
	        """Request the NLTK data sets and build the stopword set and lemmatizer.

	        Calls ``nltk.download`` for 'punkt', 'stopwords' and 'wordnet' on
	        every instantiation, then loads the English stopword list into a set
	        and creates a ``WordNetLemmatizer``.
	        """
	        for resource in ('punkt', 'stopwords', 'wordnet'):
	            nltk.download(resource)

	        # A set gives O(1) membership checks in remove_stopwords.
	        self.stop_words = set(stopwords.words('english'))
	        self.lemmatizer = WordNetLemmatizer()

	    def remove_urls(self, text: str) -> str:
	        """Return ``text`` with http(s):// and www. links deleted.

	        Only the link itself is removed; surrounding whitespace is kept, so
	        the result may contain doubled spaces.
	        """
	        return self._URL_PATTERN.sub('', text)

	    def remove_emojis(self, text: str) -> str:
	        """Return ``text`` with emojis replaced by the empty string."""
	        return emoji.replace_emoji(text, replace='')

	    def remove_special_chars(self, text: str) -> str:
	        """Return ``text`` keeping only ASCII letters and whitespace.

	        Digits, punctuation and non-ASCII characters are all dropped.
	        """
	        return self._NON_LETTER_PATTERN.sub('', text)

	    def remove_extra_spaces(self, text: str) -> str:
	        """Collapse each whitespace run to one space and trim both ends."""
	        return self._WHITESPACE_PATTERN.sub(' ', text).strip()

	    def lemmatize_text(self, text: str) -> str:
	        """Lemmatize every whitespace-separated token and re-join with spaces."""
	        # Tokens come from str.split(), not from an NLTK tokenizer.
	        lemmatize = self.lemmatizer.lemmatize
	        return ' '.join(lemmatize(token) for token in text.split())

	    def remove_stopwords(self, text: str) -> str:
	        """Drop whitespace-separated tokens whose lowercase form is a stopword.

	        Surviving tokens keep their original casing and are re-joined with
	        single spaces.
	        """
	        stop_words = self.stop_words
	        return ' '.join(
	            token for token in text.split()
	            if token.lower() not in stop_words
	        )

	    def correct_spelling(self, text: str) -> str:
	        """Return the result of TextBlob's ``correct()`` on ``text`` as a str."""
	        return str(TextBlob(text).correct())

	    def preprocess_text(self, text: str,
	                        remove_urls: bool = True,
	                        remove_emojis: bool = True,
	                        remove_special_chars: bool = True,
	                        remove_stopwords: bool = True,
	                        lemmatize: bool = True,
	                        correct_spelling: bool = False) -> str:
	        """Lowercase ``text`` and run the enabled cleaning steps in order.

	        Order: lowercase -> URLs -> emojis -> special characters ->
	        stopwords -> lemmatization -> spelling correction -> whitespace
	        normalization. Lowercasing and whitespace normalization always run;
	        each other step is controlled by its flag.

	        Args:
	            text: Input to clean. Any non-``str`` value yields ``""``.
	            remove_urls: Strip http(s):// and www. links.
	            remove_emojis: Strip emojis.
	            remove_special_chars: Strip everything except ASCII letters
	                and whitespace.
	            remove_stopwords: Drop English stopwords.
	            lemmatize: Lemmatize each token.
	            correct_spelling: Run TextBlob spelling correction (off by
	                default).

	        Returns:
	            The cleaned string, or ``""`` when ``text`` is not a string.
	        """
	        if not isinstance(text, str):
	            return ""

	        text = text.lower()

	        # URLs are removed before special characters so that "://", "." and
	        # "/" are still present for the URL pattern to match.
	        if remove_urls:
	            text = self.remove_urls(text)
	        if remove_emojis:
	            text = self.remove_emojis(text)
	        if remove_special_chars:
	            text = self.remove_special_chars(text)
	        if remove_stopwords:
	            text = self.remove_stopwords(text)
	        if lemmatize:
	            text = self.lemmatize_text(text)
	        if correct_spelling:
	            text = self.correct_spelling(text)

	        return self.remove_extra_spaces(text)

	    def preprocess_dataframe(self, df: pd.DataFrame,
	                             text_column: str,
	                             **kwargs) -> pd.DataFrame:
	        """Return a copy of ``df`` with ``text_column`` run through ``preprocess_text``.

	        Args:
	            df: Source frame; it is copied and not modified.
	            text_column: Name of the column to clean.
	            **kwargs: Forwarded unchanged to ``preprocess_text`` for every
	                cell (e.g. ``lemmatize=False``).

	        Returns:
	            A new DataFrame whose ``text_column`` holds the cleaned strings.
	        """
	        df = df.copy()
	        df[text_column] = df[text_column].apply(
	            lambda value: self.preprocess_text(value, **kwargs)
	        )
	        return df