Upload app.py

app.py CHANGED
Every line of the file is marked as rewritten, but the only content change is in `md_intro`: the two previously empty tab bullets are filled in.

```diff
@@ -19,6 +19,6 @@
 md_intro = '''# Business News Sentiment Dashboard
 The dashboard has 2 tabs:
--
--
+- Sentiment prediction: receives a news link and outputs sentiment results
+- News sentiment report: reports the sentiment of business news from the past few days
 
 Main libraries used: scrapy, SpaCy, PyTorch, transformers, streamlit
```

The full file after the change:
```python
# Load libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import cleantext
import re
import ast
import streamlit as st
import spacy
from spacy.lang.en import English
from dotenv import load_dotenv
from subprocess import Popen
import scrapy
from scrapy import Selector
import json
import requests


md_intro = '''# Business News Sentiment Dashboard
The dashboard has 2 tabs:
- Sentiment prediction: receives a news link and outputs sentiment results
- News sentiment report: reports the sentiment of business news from the past few days

Main libraries used: scrapy, SpaCy, PyTorch, transformers, streamlit

News scope: CNN, BBC, CNBC (other business news sources don't have free access)

Time scope: up to 3 days (from yesterday to 3 days ago), based on the free tier's available data source
'''
md_sumstats = '''## News Sentiment Summary
'''
md_table = '''## News Sentiment Report
'''
md_notes = '''## Notes and Thoughts:
A lexicon-based approach may confuse named entities with actual sentiment, because brand names can contain positive words, e.g., BeautifulSoup.

Hence, running named entity recognition before sentiment analysis helps to improve accuracy.

An RNN-based approach can also overcome lexicon issues, but it takes more resources.

## References:
https://edition.cnn.com/business

https://www.bbc.com/business

https://www.cnbc.com/business/

https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis

https://huggingface.co/Venkatesh4342/distilbert-helpdesk-sentence-sentiment

https://kennethenevoldsen.github.io/asent/introduction.html
'''

# Load yesterday's merged news file
dat_name = './news_db/merged_news_data_' + (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d') + '.csv'
news = pd.read_csv(dat_name, on_bad_lines='skip')
news[['pos_sent', 'neg_sent']] = news[['pos_sent', 'neg_sent']].fillna('')
news['clean_content'] = news.clean_content.apply(ast.literal_eval)  # cells are stringified lists
news = news.fillna(value='')
news['clean_date'] = pd.to_datetime(news['clean_date'], dayfirst=True)
```
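The `literal_eval` step assumes `clean_content` was serialized into the CSV as the string form of a Python list; a minimal sketch of the round trip:

```python
import ast

# A cell as it would appear in the CSV (a stringified list of sentences)
cell = "['Stocks rallied on Monday.', 'Oil prices slipped.']"
sentences = ast.literal_eval(cell)
print(sentences[0])  # -> Stocks rallied on Monday.
```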
```python
# Calculate summary statistics for the report tab
def news_stats(news, method_selection, range_selection):
    overall_sentiment = 0
    news_count = 0
    news['chosen_score'] = np.where((method_selection == 'Lexicon') | (method_selection is None),
                                    news['arti_score'], news['rnn_arti_score'])
    yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    if range_selection == '1 day' or range_selection is None:
        overall_sentiment = news[news.date_extracted == yesterday].chosen_score.mean()
        news_count = news[news.date_extracted == yesterday].title.count()
    elif range_selection == '3 days':
        overall_sentiment = news.chosen_score.mean()
        news_count = news.title.count()
    return overall_sentiment, news_count
```
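A quick sanity check of `news_stats` on a hypothetical two-row frame (the values are made up, and only the columns the function touches are included; assumes app.py's definitions are in scope):

```python
import pandas as pd
from datetime import datetime, timedelta

yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
demo = pd.DataFrame({'title': ['Fed holds rates', 'Retail sales dip'],
                     'date_extracted': [yesterday, yesterday],
                     'arti_score': [0.5, -0.1],
                     'rnn_arti_score': [0.4, -0.2]})
print(news_stats(demo, 'Lexicon', '1 day'))  # -> (0.2, 2)
```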
```python
def news_table(news, date_selection, method_selection):
    if date_selection == 'Yesterday' or date_selection is None:
        date_selected = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    elif date_selection == '2 Days Ago':
        date_selected = (datetime.today() - timedelta(days=2)).strftime('%Y-%m-%d')
    elif date_selection == '3 Days Ago':
        date_selected = (datetime.today() - timedelta(days=3)).strftime('%Y-%m-%d')

    if method_selection == 'Lexicon' or method_selection is None:
        clean_news = news.loc[news.date_extracted == date_selected, ['title', 'arti_score', 'pos_sent', 'neg_sent', 'clean_date', 'url']]
        clean_news = clean_news.rename(columns={'title': 'Title', 'url': 'URL', 'clean_date': 'Date',
                                                'arti_score': 'Sentiment Score',
                                                'pos_sent': 'Most Positive Sentence',
                                                'neg_sent': 'Least Positive Sentence'})
    elif method_selection == 'Transformer':
        clean_news = news.loc[news.date_extracted == date_selected, ['title', 'rnn_arti_score', 'rnn_pos_sent', 'rnn_neg_sent', 'clean_date', 'url']]
        clean_news = clean_news.rename(columns={'title': 'Title', 'url': 'URL', 'clean_date': 'Date',
                                                'rnn_arti_score': 'Sentiment Score',
                                                'rnn_pos_sent': 'Most Positive Sentence',
                                                'rnn_neg_sent': 'Least Positive Sentence'})

    # Formatting for table display
    clean_news = clean_news.sort_values('Date', ascending=False).reset_index(drop=True)
    clean_news.loc[:, 'Title'] = clean_news['Title'].str.wrap(width=40)
    clean_news.loc[:, 'URL'] = clean_news['URL'].str.wrap(width=65)
    clean_news.loc[:, 'Most Positive Sentence'] = clean_news['Most Positive Sentence'].str.wrap(width=65)
    clean_news.loc[:, 'Least Positive Sentence'] = clean_news['Least Positive Sentence'].str.wrap(width=65)

    return clean_news
```
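`str.wrap` only inserts newline characters so long cells break across lines; a small illustration on made-up text:

```python
import pandas as pd

s = pd.Series(["A very long headline about markets and central banks"])
print(s.str.wrap(width=20)[0])
# A very long headline
# about markets and
# central banks
```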
```python
def bbc_json_extract(bbc_script):
    # BBC pages embed the article body as JSON inside a <script> tag
    json_data = json.loads(bbc_script)
    res = ''
    news_key = list(json_data['props']['pageProps']['page'].keys())[0]
    for item in json_data['props']['pageProps']['page'][news_key]['contents']:
        if item['type'] == 'text':
            for block in item['model']['blocks']:
                if block['type'] == 'paragraph':
                    res = res + block['model']['text'] + ' '
    return res
```
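For reference, a minimal sketch of the JSON nesting this parser expects, inverted from the code above (the page key and the paragraph text are invented; only the structure matters):

```python
import json

demo_script = json.dumps({
    "props": {"pageProps": {"page": {"@news,demo-article": {"contents": [
        {"type": "text", "model": {"blocks": [
            {"type": "paragraph", "model": {"text": "Markets rose on Tuesday."}},
            {"type": "paragraph", "model": {"text": "Tech stocks led the gains."}},
        ]}}
    ]}}}}
})
print(bbc_json_extract(demo_script))
# -> Markets rose on Tuesday. Tech stocks led the gains.
```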
```python
def link_extract(link):
    # Normalize the URL, fetch the page, and extract the article text per source
    extracted_content = ''
    if link is not None and link != '':
        if 'https://' in link or 'http://' in link:
            clean_link = link
        else:
            clean_link = 'https://' + link
        html = requests.get(clean_link).text  # Selector requires str, not bytes
        sel = Selector(text=html)
        if 'www.bbc.com' in clean_link:
            raw_content = sel.xpath('//body//script//text()').extract()[0]
            extracted_content = bbc_json_extract(raw_content)
        elif 'edition.cnn.com' in clean_link:
            extracted_content = ''.join(sel.xpath('//p[contains(@class, "paragraph") and not(@class="footer__disclaimer-text") and not(@class="footer__copyright-text")]//text()').getall()).strip()
        elif 'www.cnbc.com' in clean_link:
            extracted_content = ''.join(sel.xpath('//div[@class="ArticleBody-articleBody"]//p//text()').getall()).strip()
    return extracted_content


def sentence_breakdown(string):
    # Transform scraped text into a list of separate sentences
    sentences = []
    if string != "":
        clean_string = cleantext.clean(string, extra_spaces=True)
        for ch in [r'\r', r'\n', r'\t', r'\xa0', '\r', '\n', '\t', '\xa0']:
            if ch in clean_string:
                clean_string = clean_string.replace(ch, '')
        nlp = English()
        nlp.add_pipe('sentencizer')
        doc = nlp(clean_string)
        sentences = [sent.text.strip() for sent in doc.sents]
    return sentences
```
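Example of the intended output (exact casing and spacing can vary with `cleantext`'s defaults):

```python
print(sentence_breakdown("Stocks fell sharply. Oil rose, however!"))
# e.g. -> ['Stocks fell sharply.', 'Oil rose, however!']
```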
```python
def rnn_sentence_sentiment(sent):
    from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification

    tokenizer = AutoTokenizer.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
    pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)
    prediction = pipe(sent, top_k=1)

    # Assign a signed score: positive sentiment > 0, negative < 0, neutral = 0
    if prediction[0]['label'] == 'Positive':
        res = prediction[0]['score']
    elif prediction[0]['label'] == 'Negative':
        res = -prediction[0]['score']
    else:  # 'Neutral' or any unexpected label, so res is always bound
        res = 0

    return res
```
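Note that `rnn_sentence_sentiment` reloads the tokenizer and model on every call. One possible optimization, not part of this commit, is to cache the pipeline with Streamlit's resource cache:

```python
from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification

@st.cache_resource
def get_sentiment_pipe():
    # Loaded once per process and reused across reruns
    tokenizer = AutoTokenizer.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("Venkatesh4342/distilbert-helpdesk-sentence-sentiment")
    return TextClassificationPipeline(model=model, tokenizer=tokenizer)
```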
```python
def article_sentiment(arti):
    # Input is a list of strings/sentences
    scores = dict()
    for sent in arti:
        scores[sent] = rnn_sentence_sentiment(sent)

    scores_list = list(scores.values())
    # Average over non-neutral sentences only
    arti_score = np.mean([score for score in scores_list if score != 0])
    pos_sents = sorted(scores, key=scores.get, reverse=True)[:3]
    neg_sents = sorted(scores, key=scores.get, reverse=False)[:3]
    return round(arti_score, 3), pos_sents, neg_sents
```
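The positive/negative selection is just a sort of the sentence-to-score dict by value; for instance, with made-up scores:

```python
scores = {'great quarter': 0.9, 'profit warning': -0.8, 'flat outlook': 0.0, 'modest growth': 0.2}
print(sorted(scores, key=scores.get, reverse=True)[:3])  # -> ['great quarter', 'modest growth', 'flat outlook']
print(sorted(scores, key=scores.get)[:3])                # -> ['profit warning', 'flat outlook', 'modest growth']
```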
```python
st.markdown(md_intro)
tab_pred, tab_news = st.tabs(["Sentiment Prediction", "News Report"])
with tab_pred:
    st.write("This is a sentiment prediction module.\nPlease enter your news link into the textbox.\nSentiment prediction will be returned shortly!\nExample link: https://www.bbc.com/news/technology-68818113")
    newslink = st.chat_input(placeholder="Please input a CNN/BBC/CNBC link")
    if newslink:
        placeholder = st.empty()
        placeholder.text("Running ...")
        extracted_content = link_extract(newslink)
        cleaned_content = sentence_breakdown(extracted_content)
        arti_score, user_pos_sents, user_neg_sents = article_sentiment(cleaned_content)
        placeholder.empty()

        st.markdown(f'### Article sentiment score is: {arti_score}')
        st.markdown("### Three most positive sentences are: ")
        for sent in user_pos_sents:
            st.markdown(sent)
        st.markdown("### Three most negative sentences are: ")
        for sent in user_neg_sents:
            st.markdown(sent)


with tab_news:
    st.markdown(md_sumstats)
    method_col, range_col = st.columns(2)
    with method_col:
        method_selection = st.selectbox("Select Method", ('Lexicon', 'Transformer'))
    with range_col:
        range_selection = st.selectbox("Statistics Range", ('1 day', '3 days'))
    overall_sentiment, news_count = news_stats(news, method_selection, range_selection)
    senti_col, count_col = st.columns(2)
    senti_col.metric("Overall Sentiment", str(round(overall_sentiment, 3)))
    count_col.metric("Number of Articles", str(news_count))
    st.markdown(md_table)
    date_selection = st.selectbox("Extraction Date", ('Yesterday', '2 Days Ago', '3 Days Ago'))
    clean_news = news_table(news, date_selection, method_selection)
    st.dataframe(data=clean_news,
                 column_config={"Title": st.column_config.Column(width=250),
                                "Most Positive Sentence": st.column_config.Column(width=400),
                                "Least Positive Sentence": st.column_config.Column(width=400),
                                "Date": st.column_config.DateColumn(format="DD-MM-YYYY"),
                                "URL": st.column_config.LinkColumn("App URL", width=400)})
st.markdown(md_notes)
```
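Assuming the dependencies are installed and yesterday's CSV exists under ./news_db/, the app launches with `streamlit run app.py`. The prediction path can also be exercised directly (requires network access and a model download; the link is the example from the UI text):

```python
link = "https://www.bbc.com/news/technology-68818113"
content = link_extract(link)
sents = sentence_breakdown(content)
score, top_pos, top_neg = article_sentiment(sents)
print(score, top_pos[0], top_neg[0])
```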