# Streamlit app: Arabic headline news detection.
#
# The user types a headline; the app cleans it, searches Google News (Arabic),
# compares the query against the returned titles and descriptions with the
# spaCy model's vector similarity, and reports whether a sufficiently similar
# published story exists.
import re
import string

import gensim
import numpy as np
import pandas as pd
import sklearn
import spacy
import streamlit as st
from GoogleNews import GoogleNews
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity

nlp = spacy.load("spacy.aravec.model")

# A headline counts as confirmed when its best title or description match
# reaches this similarity score.
SIMILARITY_THRESHOLD = .85

# ---------------------------------------------------------------------------
# Side bar
# ---------------------------------------------------------------------------
st.sidebar.markdown('مواقع اخباريه معتمده ')
st.sidebar.markdown("[العربية](https://www.alarabiya.net/)")
st.sidebar.markdown("[الجزيرة نت](https://www.aljazeera.net/news/)")
st.sidebar.markdown("[وكالة الانباء الكويتية](https://www.kuna.net.kw/Default.aspx?language=ar)")

st.write(""" Arabic headline news detection """)
tx = st.text_input(''' الرجاء ادخال العنوان المراد التاكد من صحته ''')

# ---------------------------------------------------------------------------
# Pre-processing functions
# ---------------------------------------------------------------------------
# Compiled once: clean_str runs on every nlp() call through Preprocessor.
_TASHKEEL_RE = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
_ELONGATION_RE = re.compile(r'(.)\1+')
_TRAILING_NONWORD_RE = re.compile(r"(\W+)$")
_HASHTAG_PIECE_RE = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")

# (old, new) pairs applied sequentially, in this order, by clean_str.
# Order matters: later pairs see the output of earlier ones.
_CHAR_REPLACEMENTS = (
    ("أ", "ا"),
    ("إ", "ا"),
    ("آ", "ا"),
    ("ة", "ه"),
    ("_", " "),
    ("-", " "),
    ("/", ""),
    (".", ""),
    ("،", ""),
    (" و ", " و"),
    (" يا ", " يا"),
    ('"', ""),
    ("ـ", ""),
    ("'", ""),
    ("ى", "ي"),
    ("\\", ""),
    ('\n', ' '),
    ('\t', ' '),
    ('"', ' '),
    ('?', ' ? '),
    ('؟', ' ؟ '),
    ('!', ' ! '),
)


def clean_str(text):
    """Normalize Arabic text before tokenization.

    Strips diacritics (tashkeel), collapses runs of a repeated character,
    unifies letter variants, removes or pads punctuation, and trims the
    surrounding whitespace.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    # Remove tashkeel.
    text = _TASHKEEL_RE.sub("", text)

    # Collapse elongation: any run of a repeated character becomes two...
    text = _ELONGATION_RE.sub(r"\1\1", text)
    # ...and doubled long vowels are reduced to one.
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')

    for old, new in _CHAR_REPLACEMENTS:
        text = text.replace(old, new)

    return text.strip()


def split_hashtag_to_words(tag):
    """Split a hashtag into its component words.

    Underscore-separated tags are split on the underscore. Otherwise the tag
    is split into capitalized words, all-caps runs and digit runs. If that
    yields nothing (for example a single Arabic word, which has no letter
    case), the tag itself is returned so the word is not lost.

    Args:
        tag: Hashtag text, with or without the leading ``#``.

    Returns:
        List of words extracted from the tag (empty for an empty tag).
    """
    tag = tag.replace('#', '')
    parts = tag.split('_')
    if len(parts) > 1:
        return parts
    pieces = _HASHTAG_PIECE_RE.findall(tag)
    if pieces:
        return pieces
    # No camel-case/digit pieces found: keep the whole tag as one word
    # instead of silently dropping it.
    return [tag] if tag else []


def clean_hashtag(text):
    """Replace every hashtag token in *text* with its component words.

    Args:
        text: Whitespace-separated input string.

    Returns:
        The string with hashtags expanded, tokens joined by single spaces.
    """
    tokens = []
    for word in text.split():
        if is_hashtag(word):
            tokens.extend(extract_hashtag(word))
        else:
            tokens.append(word)
    return " ".join(tokens)


def is_hashtag(word):
    """Return True when *word* starts with ``#``."""
    return word.startswith("#")


def extract_hashtag(text):
    """Return the words contained in all hashtag tokens of *text*.

    Trailing non-word characters are stripped from each hashtag before it is
    split with ``split_hashtag_to_words``.
    """
    hashtags = [
        _TRAILING_NONWORD_RE.sub("", token)
        for token in text.split()
        if token.startswith("#")
    ]
    words = []
    for hashtag in hashtags:
        words.extend(split_hashtag_to_words(hashtag))
    return words


class Preprocessor:
    """Tokenizer wrapper that runs ``clean_str`` before tokenizing."""

    def __init__(self, tokenizer, **cfg):
        # Extra keyword arguments are accepted and ignored.
        self.tokenizer = tokenizer

    def __call__(self, text):
        preprocessed = clean_str(text)
        return self.tokenizer(preprocessed)


def _most_similar(texts, query_doc):
    """Return ``(index, score)`` of the text most similar to *query_doc*.

    Each text is parsed with the module-level ``nlp`` pipeline and compared
    to the already-parsed query. *texts* must be non-empty.
    """
    scores = [nlp(text).similarity(query_doc) for text in texts]
    best_index = int(np.argmax(scores))
    return best_index, scores[best_index]


# ---------------------------------------------------------------------------
# End of pre-processing
# ---------------------------------------------------------------------------

# Route all text through clean_str before the model's own tokenizer.
nlp.tokenizer = Preprocessor(nlp.tokenizer)

if tx:
    googlenews = GoogleNews(lang='ar')
    googlenews.clear()

    st.markdown(f"Searching for: {tx}")
    st.markdown("ــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــ")

    tx = clean_hashtag(tx)
    tx = clean_str(tx)
    googlenews.search(tx)
    result = googlenews.page_at(1)
    googlenews.clear()

    if not result:
        # Nothing found: report the headline as fake and fill every displayed
        # field with a placeholder so the report below can always render.
        prediction = 'الخبر زائف'
        top_similar_news = 'لا يوجد اخبار مماثله'
        medium = 'لا يوجد مصدر'
        descr = 'لا يوجد وصف'
        date = 'لا يوجد تاريخ'
        link = 'لا يوجد رابط'
    else:
        # Parse the query once instead of once per candidate text.
        query_doc = nlp(tx)

        titles = [item['title'] for item in result]
        descriptions = [item['desc'] for item in result]

        best_title_index, best_title_score = _most_similar(titles, query_doc)
        _, best_desc_score = _most_similar(descriptions, query_doc)

        # The reported story is the one whose title matches best.
        best_item = result[best_title_index]
        top_similar_news = best_item['title']
        descr = best_item['desc']
        medium = best_item['media']
        date = best_item['date']
        link = best_item['link']

        if (best_title_score >= SIMILARITY_THRESHOLD
                or best_desc_score >= SIMILARITY_THRESHOLD):
            prediction = 'الخبر صحيح'
        else:
            prediction = ' الخبر زائف'

    st.markdown(f"System Prediction : {prediction}")
    st.markdown(f"الخبر المماثل: {top_similar_news}")
    st.markdown("")
    st.markdown(f"تاريخ الخبر: {date}")
    st.markdown("")
    st.markdown(f"التفصيل: {descr}")
    st.markdown("")
    st.markdown(f"المصدر: {medium}")
    st.markdown("")
    st.markdown(f"رابط الخبر: {link}")