import streamlit as st
import pdfplumber
import re
from transformers import pipeline, AutoTokenizer
# ───────────────── Cached pipelines ──────────────────────────────────
@st.cache_resource(ttl=86400)  # rebuild the pipelines at most once a day
def load_pipes():
    summarizer = pipeline("summarization", model=SUMM_MODEL)
    tokenizer = AutoTokenizer.from_pretrained(SUMM_MODEL)
    sentiment = pipeline("text-classification", model=SENT_MODEL)
    ner = pipeline("token-classification", model=NER_MODEL,
                   aggregation_strategy="simple")
    return summarizer, tokenizer, sentiment, ner
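# st.cache_resource keeps one shared copy of these heavy objects per server
# process, so Streamlit reruns (triggered by every widget interaction) skip
# the model loads. A minimal sketch of the same pattern for any expensive
# resource (SomeExpensiveClient is hypothetical):
#
#     @st.cache_resource
#     def get_client():
#         return SomeExpensiveClient()  # built once, reused across reruns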
# ───────────────── Helper functions ──────────────────────────────────
def split_by_tokens(text, max_tokens):
    # Greedily pack whitespace-separated words into chunks of at most
    # max_tokens tokenizer tokens, so each chunk fits the model's window.
    words = re.split(r"(\s+)", text)
    buf, n = "", 0
    for w in words:
        # Count content tokens only; counting special tokens per word
        # would inflate the estimate for every single word.
        ln = len(TOK(w, add_special_tokens=False).input_ids)
        if n + ln <= max_tokens:
            buf, n = buf + w, n + ln
        else:
            yield buf.strip()
            buf, n = w, ln
    if buf.strip():
        yield buf.strip()
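# Rough illustration of the chunking (hypothetical counts; real boundaries
# depend on how the tokenizer splits words and whitespace):
#
#     chunks = list(split_by_tokens(article_text, max_tokens=MAX_TOK))
#     # e.g. a ~3,000-token article -> 3 chunks, each at most MAX_TOK tokens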
def summarise(text):
    # Map-reduce summarization: summarize each chunk separately, then
    # compress the concatenated chunk summaries if they overshoot the target.
    parts = list(split_by_tokens(text, MAX_TOK))
    per_len = max(25, min(80, TARGET_WORDS // max(1, len(parts))))
    first = [SUMMAR(p, max_length=per_len,
                    min_length=per_len // 2,
                    do_sample=False)[0]["summary_text"]
             for p in parts]
    joined = " ".join(first)
    if len(joined.split()) > TARGET_WORDS:
        joined = SUMMAR(joined, max_length=TARGET_WORDS,
                        min_length=TARGET_WORDS // 2,
                        do_sample=False)[0]["summary_text"]
    return joined
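# Worked example of the per-chunk budget: an article that splits into 3
# chunks gets per_len = max(25, min(80, 225 // 3)) = 75, i.e. roughly 75
# generated tokens per chunk summary; the joined text is re-summarized only
# if it still exceeds TARGET_WORDS. (max_length counts generated tokens,
# which only approximately corresponds to words.)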
def shorten(summary, n):
    # Keep only the first n sentences (naive split on ". ").
    s = summary.split(". ")
    return (". ".join(s[:n]).rstrip(".") + ".") if len(s) > n else summary
def extract_pdf(file):
    txt = ""
    with pdfplumber.open(file) as pdf:
        for p in pdf.pages:
            # extract_text() returns None for pages with no text layer
            # (e.g. scanned images), hence the `or ""` fallback.
            txt += p.extract_text() or ""
    return txt
def tag_entities(text):
    # Bucket NER hits into display categories; anything outside
    # ORG/PER/LOC (e.g. the model's MISC group) lands in Miscellaneous.
    tt = {"Organization": [], "Person": [], "Location": [], "Miscellaneous": []}
    for e in NER(text):
        grp = {"ORG": "Organization", "PER": "Person",
               "LOC": "Location"}.get(e["entity_group"], "Miscellaneous")
        tt[grp].append(e["word"])
    return {k: sorted(set(v)) for k, v in tt.items() if v}
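# With aggregation_strategy="simple", the pipeline merges word pieces and
# returns dicts shaped roughly like (score and offsets abbreviated):
#
#     [{"entity_group": "ORG", "word": "Deutsche Bank", "score": 0.99, ...}]
#
# so tag_entities would map that hit to {"Organization": ["Deutsche Bank"]}.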
# ───────────────── Main Part ─────────────────────────────────────────
st.set_page_config(page_title="Financial News Analyzer",
                   page_icon="📰",
                   layout="wide")
st.title("📰 Financial News Analyzer")
st.markdown("##### Instantly grasp news content, sentiment, and relevant entities")
# Models and other constants
SUMM_MODEL = "sshleifer/distilbart-cnn-12-6"
SENT_MODEL = "nynn/Fintuned_Sentiment"
NER_MODEL = "Babelscape/wikineural-multilingual-ner"
SUMMAR, TOK, SENT_CLF, NER = load_pipes()
MAX_TOK = 1024       # distilbart's maximum input length in tokens
TARGET_WORDS = 225   # rough word budget for the combined summary
LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive", "LABEL_2": "Neutral"}
COLOR_MAP = {"Positive": "green", "Negative": "red", "Neutral": "gray"}
# ───────────────── Sidebar input ─────────────────────────────────────
with st.sidebar:
    st.header("Input News to Analyze:")
    txt_input = st.text_area("Paste news article", height=150)
    pdf_file = st.file_uploader("Or upload PDF", type=["pdf"])
    sent_count = st.slider("Summary length (sentences)",
                           min_value=1, max_value=5, value=3, step=1)
    run_btn = st.button("🔍 Analyze", use_container_width=True)

# An uploaded PDF takes precedence over pasted text.
raw_text = extract_pdf(pdf_file) if pdf_file else txt_input.strip()
# ───────────────── Main pipeline ─────────────────────────────────────
if run_btn:
    if not raw_text:
        st.warning("Please provide text or a PDF first.")
        st.stop()
    with st.spinner("Analyzing…"):
        full_sum = summarise(raw_text)
        summary = shorten(full_sum, sent_count)

    cols = st.columns([2, 1])
    with cols[0]:
        st.subheader("📝 Summary")
        st.write(summary)
    with cols[1]:
        # Classify the summary rather than the full article, so the input
        # stays within the sentiment model's length limit.
        res = SENT_CLF(summary)[0]
        label = LABEL_MAP.get(res["label"], res["label"])
        colour = COLOR_MAP.get(label, "gray")  # fall back for unmapped labels
        st.subheader("📊 Sentiment")
        st.markdown(f"<h3 style='color:{colour};margin-bottom:0'>{label}</h3>"
                    f"<p>{res['score']*100:.1f}% Confidence</p>",
                    unsafe_allow_html=True)
    tags = tag_entities(summary)
    st.subheader("🏷️ Relevant Tags")
    if tags:
        # CSS for the badge pills
        pill_css = """
        <style>
        .tag-pill {
            display: inline-block;
            background: #f0f2f6;
            color: #333;
            padding: 4px 10px;
            margin: 2px 4px 2px 0;
            border-radius: 12px;
            font-size: 0.9em;
        }
        .tag-cat {
            font-weight: 600;
            margin-top: 0;
            margin-bottom: 4px;
        }
        </style>
        """
        st.markdown(pill_css, unsafe_allow_html=True)
        # Render each category as a header followed by its pills
        for category, vals in tags.items():
            st.markdown(f"<div class='tag-cat'>{category}</div>", unsafe_allow_html=True)
            pills = "".join(f"<span class='tag-pill'>{v}</span>" for v in vals)
            st.markdown(pills, unsafe_allow_html=True)
    else:
        st.info("No entities detected.")