"""Clean the ``purpose_text`` column of a CSV file and save the result.

Each text value is lowercased, stripped of every character that is not an
ASCII letter or whitespace, split on whitespace, filtered against NLTK's
English stop-word list, and lemmatized with NLTK's ``WordNetLemmatizer``.
The cleaned text is stored in a new ``cleaned_text`` column.
"""

import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Request the NLTK corpora that the stop-word list and lemmatizer rely on.
nltk.download('stopwords')
nltk.download('wordnet')

# Built once at import time: set membership keeps the per-word stop-word
# check cheap, and a single lemmatizer instance is shared by every call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def clean_text(text):
    """Return a normalized, stop-word-free, lemmatized version of *text*.

    Args:
        text: The raw cell value. Expected to be a ``str``; any other value
            (for example the float NaN pandas uses for an empty CSV cell)
            is treated as having no text.

    Returns:
        The remaining lemmatized words joined by single spaces, or an empty
        string when *text* is not a string or no words survive cleaning.
    """
    # Missing CSV cells arrive as float NaN, which has no ``.lower()``;
    # without this guard a single empty cell aborts the whole ``apply``.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _NON_LETTER_RE.sub('', text)
    # Stop words are filtered on the raw token, before lemmatization.
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(words)


def main():
    """Read ``data.csv``, add the ``cleaned_text`` column, write the output."""
    df = pd.read_csv('data.csv')  # Replace with your actual CSV file
    df['cleaned_text'] = df['purpose_text'].apply(clean_text)
    df.to_csv('cleaned_data.csv', index=False)


if __name__ == "__main__":
    main()