import re

import pdfplumber
import streamlit as st
from transformers import AutoTokenizer, pipeline


@st.cache_resource(ttl=86400)
def load_pipes():
    """Load the summarization, sentiment, and NER pipelines and cache them."""
    summarizer = pipeline("summarization", model=SUMM_MODEL)
    tokenizer = AutoTokenizer.from_pretrained(SUMM_MODEL)
    sentiment = pipeline("text-classification", model=SENT_MODEL)
    ner = pipeline("token-classification", model=NER_MODEL,
                   aggregation_strategy="simple")  # merge sub-word pieces into whole entities
    return summarizer, tokenizer, sentiment, ner
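

# NOTE: st.cache_resource shares one copy of these pipelines across sessions and
# reruns; ttl=86400 simply rebuilds them after 24 hours.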


def split_by_tokens(text, max_tokens):
    """Yield chunks of `text` whose running token count stays within `max_tokens`."""
    words = re.split(r"(\s+)", text)  # keep the whitespace so chunks rejoin cleanly
    buf, n = "", 0
    for w in words:
        # Per-piece token count; it includes the tokenizer's special tokens, so it
        # over-counts slightly and keeps each chunk comfortably under the limit.
        ln = len(TOK(w).input_ids)
        if n + ln <= max_tokens:
            buf, n = buf + w, n + ln
        else:
            yield buf.strip()
            buf, n = w, ln
    if buf.strip():
        yield buf.strip()
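

# Chunking is needed because distilbart-cnn accepts at most 1024 input tokens
# (MAX_TOK below); summarise() runs one pass per chunk, then a condensing pass.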


def summarise(text):
    """Map-reduce summary: summarise each chunk, then condense if still too long."""
    parts = list(split_by_tokens(text, MAX_TOK))
    # Spread the TARGET_WORDS budget over the chunks, clamped between 25 and 80.
    # per_len is passed as max_length (measured in tokens), so the budget is approximate.
    per_len = max(25, min(80, TARGET_WORDS // max(1, len(parts))))
    first = [SUMMAR(p, max_length=per_len,
                    min_length=per_len // 2,
                    do_sample=False)[0]["summary_text"]
             for p in parts]
    joined = " ".join(first)
    if len(joined.split()) > TARGET_WORDS:
        joined = SUMMAR(joined, max_length=TARGET_WORDS,
                        min_length=TARGET_WORDS // 2,
                        do_sample=False)[0]["summary_text"]
    return joined


def shorten(summary, n):
    """Return at most the first `n` sentences of the summary."""
    s = summary.split(". ")
    return (". ".join(s[:n]).rstrip(".") + ".") if len(s) > n else summary


def extract_pdf(file):
    """Concatenate the text of every page in the uploaded PDF."""
    txt = ""
    with pdfplumber.open(file) as pdf:
        for p in pdf.pages:
            # extract_text() returns None for pages with no text layer.
            txt += (p.extract_text() or "") + "\n"
    return txt


def tag_entities(text):
    """Group NER hits into Organization / Person / Location / Miscellaneous."""
    tt = {"Organization": [], "Person": [], "Location": [], "Miscellaneous": []}
    for e in NER(text):
        grp = {"ORG": "Organization", "PER": "Person",
               "LOC": "Location"}.get(e["entity_group"], "Miscellaneous")
        tt[grp].append(e["word"])
    # Deduplicate, sort, and drop empty categories.
    return {k: sorted(set(v)) for k, v in tt.items() if v}
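

# Streamlit page: the sidebar collects the input, the main area shows the results.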


st.set_page_config(page_title="Financial News Analyzer",
                   page_icon="💰",
                   layout="wide")
st.title("💰 Financial News Analyzer")
st.markdown("##### Instantly grasp news content, sentiment, and relevant entities")


SUMM_MODEL = "sshleifer/distilbart-cnn-12-6"
SENT_MODEL = "nynn/Fintuned_Sentiment"
NER_MODEL = "Babelscape/wikineural-multilingual-ner"
SUMMAR, TOK, SENT_CLF, NER = load_pipes()

MAX_TOK = 1024      # distilbart-cnn's maximum input length, in tokens
TARGET_WORDS = 225  # rough word budget for the combined summary
LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive", "LABEL_2": "Neutral"}
COLOR_MAP = {"Positive": "green", "Negative": "red", "Neutral": "gray"}


with st.sidebar:
    st.header("Input News to Analyze:")
    txt_input = st.text_area("Paste news article", height=150)
    pdf_file = st.file_uploader("Or upload PDF", type=["pdf"])
    sent_count = st.slider("Summary length (sentences)",
                           min_value=1, max_value=5, value=3, step=1)
    run_btn = st.button("🚀 Analyze", use_container_width=True)

# An uploaded PDF takes precedence over pasted text.
raw_text = extract_pdf(pdf_file) if pdf_file else txt_input.strip()
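

# Results are computed only after the Analyze button is pressed; sentiment and
# entity tagging run on the short summary rather than the full article.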


if run_btn:
    if not raw_text:
        st.warning("Please provide text or a PDF first.")
        st.stop()

    with st.spinner("Analyzing"):
        full_sum = summarise(raw_text)
        summary = shorten(full_sum, sent_count)

    cols = st.columns([2, 1])
    with cols[0]:
        st.subheader("📝 Summary")
        st.write(summary)

    with cols[1]:
        res = SENT_CLF(summary)[0]
        label = LABEL_MAP.get(res["label"], res["label"])
        colour = COLOR_MAP.get(label, "gray")
        st.subheader("📊 Sentiment")
        st.markdown(f"<h3 style='color:{colour};margin-bottom:0'>{label}</h3>"
                    f"<p>{res['score']*100:.1f}% Confidence</p>",
                    unsafe_allow_html=True)

    tags = tag_entities(summary)
    st.subheader("🏷️ Relevant Tags")

    if tags:
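        # Entity tags are rendered as small HTML "pills"; the CSS below is
        # injected once and applied to every category.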
        pill_css = """
        <style>
        .tag-pill {
            display: inline-block;
            background: #f0f2f6;
            color: #333;
            padding: 4px 10px;
            margin: 2px 4px 2px 0;
            border-radius: 12px;
            font-size: 0.9em;
        }
        .tag-cat {
            font-weight: 600;
            margin-top: 0;
            margin-bottom: 4px;
        }
        </style>
        """
        st.markdown(pill_css, unsafe_allow_html=True)

        for category, vals in tags.items():
            st.markdown(f"<div class='tag-cat'>{category}</div>", unsafe_allow_html=True)
            pills = "".join(f"<span class='tag-pill'>{v}</span>" for v in vals)
            st.markdown(pills, unsafe_allow_html=True)
    else:
        st.info("No entities detected.")