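"""Arabic headline fake-news checker (Streamlit app).

Searches Google News for the entered headline and compares it against the
retrieved titles/descriptions using AraVec word vectors loaded through spaCy.
Run with (filename assumed):

    streamlit run app.py
"""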
import re

import numpy as np
import pandas as pd
import spacy
import streamlit as st
from GoogleNews import GoogleNews

# AraVec word embeddings exported as a spaCy model, loaded from a local
# path (the model directory is assumed to sit next to this script).
nlp = spacy.load("spacy.aravec.model")


# Sidebar links to trusted news sources.
st.sidebar.markdown('مواقع اخباريه معتمده')  # "Trusted news sites"
st.sidebar.markdown("[العربية](https://www.alarabiya.net/)")  # Al Arabiya
st.sidebar.markdown("[الجزيرة نت](https://www.aljazeera.net/news/)")  # Al Jazeera Net
st.sidebar.markdown("[وكالة الانباء الكويتية](https://www.kuna.net.kw/Default.aspx?language=ar)")  # Kuwait News Agency (KUNA)


st.write("""
Arabic headline news detection
""")

# "Please enter the headline you want to verify"
tx = st.text_input('الرجاء ادخال العنوان المراد التاكد من صحته')


def clean_str(text):
    """Normalize Arabic text (AraVec-style preprocessing)."""
    search = ["أ", "إ", "آ", "ة", "_", "-", "/", ".", "،", " و ", " يا ", '"', "ـ", "'", "ى", "\\", '\n', '\t', '"', '?', '؟', '!']
    replace = ["ا", "ا", "ا", "ه", " ", " ", "", "", "", " و", " يا", "", "", "", "ي", "", ' ', ' ', ' ', ' ? ', ' ؟ ', ' ! ']

    # Remove tashkeel (diacritics).
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)

    # Collapse letter elongation down to at most two repeats.
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)

    # Merge the remaining doubled long vowels.
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')

    # Apply the character-level substitutions defined above.
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])

    text = text.strip()
    return text
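
# A quick illustration of clean_str's normalization (hypothetical input):
#   clean_str("رااااائع!") -> "رائع !"
# (elongation collapsed, doubled "اا" merged, "!" padded with spaces)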


def split_hashtag_to_words(tag):
    """Split a hashtag body into words ("#Breaking_News" -> ["Breaking", "News"])."""
    tag = tag.replace('#', '')
    tags = tag.split('_')
    if len(tags) > 1:
        return tags
    # CamelCase / digit split for English tags; fall back to the raw tag
    # (e.g. a single Arabic word) when the pattern matches nothing.
    pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])")
    return pattern.findall(tag) or [tag]


def clean_hashtag(text):
    """Replace every hashtag in the text with its component words."""
    words = text.split()
    text = list()
    for word in words:
        if is_hashtag(word):
            text.extend(extract_hashtag(word))
        else:
            text.append(word)
    return " ".join(text)


def is_hashtag(word):
    return word.startswith("#")


def extract_hashtag(text):
    # Drop trailing punctuation from each hashtag, then split it into words.
    hash_list = [re.sub(r"(\W+)$", "", i) for i in text.split() if i.startswith("#")]
    word_list = []
    for word in hash_list:
        word_list.extend(split_hashtag_to_words(word))
    return word_list
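
# Example (hypothetical input): clean_hashtag("#Breaking_News اليوم")
# returns "Breaking News اليوم" -- the hashtag is expanded in place.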


class Preprocessor:
    """Wrap a spaCy tokenizer so every input is cleaned before tokenizing."""

    def __init__(self, tokenizer, **cfg):
        self.tokenizer = tokenizer

    def __call__(self, text):
        preprocessed = clean_str(text)
        return self.tokenizer(preprocessed)


# Install the cleaning tokenizer into the pipeline.
nlp.tokenizer = Preprocessor(nlp.tokenizer)
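
# From here on, nlp() cleans its input before tokenizing, so e.g.
# nlp("الأخبار") and nlp("الاخبار") produce the same tokens.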


if len(tx) != 0:
    googlenews = GoogleNews(lang='ar')
    googlenews.clear()

    # Defaults shown when the search returns nothing.
    Prediction = ''
    top_similar_news = ''
    medium = ''
    descr = ''
    date = ''
    link = ''

    st.markdown(f"Searching for: {tx}")
    st.markdown("ــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــــ")

    # Normalize the query the same way the retrieved texts will be.
    tx = clean_hashtag(tx)
    tx = clean_str(tx)

    # Fetch the first page of Arabic Google News results.
    googlenews.search(tx)
    result = googlenews.page_at(1)
    googlenews.clear()
    if len(result) == 0:
        # Nothing found: treat the headline as fake.
        Prediction = 'الخبر زائف'  # "The news is fake"
        top_similar_news = 'لا يوجد اخبار مماثله'  # "No similar news"
        medium = 'لا يوجد مصدر'  # "No source"
        descr = 'لا يوجد وصف'  # "No description"

    else:
        # Collect the titles and descriptions of the returned articles.
        result_text = {"Text": [item['title'] for item in result]}
        result_text2 = {"Text": [item['desc'] for item in result]}

        result_text = pd.DataFrame(result_text)
        result_text2 = pd.DataFrame(result_text2)

        data = pd.DataFrame()
        data['Text2'] = result_text['Text'].copy()

        # Similarity between the query and each title (spaCy's .similarity
        # is the cosine similarity of the averaged word vectors).
        data['Text2'] = data['Text2'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top = data['Text2'].max(axis=0)

        top_similar_ind = np.argmax(data['Text2'])
        top_similar_news = result[top_similar_ind]['title']
        descr = result[top_similar_ind]['desc']
        medium = result[top_similar_ind]['media']
        date = result[top_similar_ind]['date']
        link = result[top_similar_ind]['link']

        # Repeat the comparison against the article descriptions.
        data['Text3'] = result_text2['Text'].copy()
        data['Text3'] = data['Text3'].apply(lambda x: nlp(x).similarity(nlp(tx)))
        sg300top2 = data['Text3'].max(axis=0)
        top_similar_ind2 = np.argmax(data['Text3'])
        tp_desc = result[top_similar_ind2]['desc']

        # Real if either the best title or best description match clears
        # the similarity threshold.
        if sg300top >= .85 or sg300top2 >= .85:
            Prediction = 'الخبر صحيح'  # "The news is real"
        else:
            Prediction = 'الخبر زائف'  # "The news is fake"
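
        # Sanity check: a query identical to a retrieved title gives a
        # cosine similarity of ~1.0, well above the 0.85 cutoff.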

    # Show the verdict and the closest matching article.
    st.markdown(f"System Prediction : {Prediction}")
    st.markdown(f"الخبر المماثل: {top_similar_news}")  # "Most similar news"
    st.markdown("")
    st.markdown(f"تاريخ الخبر: {date}")  # "News date"
    st.markdown("")
    st.markdown(f"التوصيف: {descr}")  # "Description"
    st.markdown("")
    st.markdown(f"المصدر: {medium}")  # "Source"
    st.markdown("")
    st.markdown(f"رابط الخبر: {link}")  # "News link"