bank_transaction / preprocess.py
leynessa's picture
Upload 8 files
ff52cdd verified
raw
history blame contribute delete
657 Bytes
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora used below: 'stopwords' backs `stop_words`, and
# 'wordnet' is requested for the WordNet lemmatizer.
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus)
del _corpus  # keep the loop variable out of the module namespace

# Shared lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()
# Held as a set because clean_text does a membership test per token.
stop_words = set(stopwords.words('english'))
def clean_text(text):
    """Normalize free text into a space-joined string of lemmatized tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, the result is split on whitespace, tokens found in
    the module-level ``stop_words`` set are dropped, and each remaining token
    is passed through the module-level ``lemmatizer``.

    Args:
        text: The raw text. Values that are not ``str`` (for example ``None``
            or the float NaN that pandas uses for empty CSV cells) are
            treated as missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the input
        is missing or no token survives the filtering.
    """
    # Missing values have no ``.lower()``; return an empty result instead of
    # raising AttributeError part-way through a DataFrame ``apply``.
    if not isinstance(text, str):
        return ''

    # Unwanted characters are deleted rather than replaced by a space, so
    # "don't" becomes the single token "dont".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept = [
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    ]
    return ' '.join(kept)
# Load the raw records, derive a cleaned-text column, and write the result.
df = pd.read_csv('data.csv') # Replace with your actual CSV file
# pandas reads empty CSV cells as float NaN, which has no ``.lower()``; map
# every non-string cell to '' here instead of letting clean_text raise.
df['cleaned_text'] = df['purpose_text'].apply(
    lambda value: clean_text(value) if isinstance(value, str) else ''
)
# index=False keeps the DataFrame's row index out of the output file.
df.to_csv('cleaned_data.csv', index=False)