import sqlite3
import pandas as pd
import os
from datetime import datetime
import streamlit as st

# def get_db_path():
#     db_path = "../database/stock_insights.db"
#     if not os.path.exists(db_path) and os.path.exists("/tmp/stock_insights.db"):
#         db_path = "/tmp/stock_insights.db"
#     return db_path

# download from Hugging Face dataset
def ensure_db():
    repo_path = os.path.join(os.getcwd(), "database", "stock_insights.db")
    if os.path.exists(repo_path):
        return repo_path

    candidates = [
        os.path.join("/app", "database", "stock_insights.db"),
        os.path.join("/tmp", "database", "stock_insights.db"),
        os.path.join("/tmp", "stock_insights.db"),
    ]
    for p in candidates:
        if os.path.exists(p):
            return p

    try:
        from huggingface_hub import hf_hub_download
        tmp_dir = os.path.join("/tmp", "database")
        os.makedirs(tmp_dir, exist_ok=True)
        local_file = hf_hub_download(
            repo_id="PuppetLover/stock_insights",
            filename="stock_insights.db",
            repo_type="dataset",
            local_dir=tmp_dir,
            local_dir_use_symlinks=False,
        )
        return local_file
    except Exception as e:
        local_rel = os.path.join("database", "stock_insights.db")
        if os.path.exists(local_rel):
            return local_rel
        raise RuntimeError(f"Cannot access or download database file: {e}")

# Resolve the database path once at import time; DB_PATH is reused throughout this module
DB_PATH = ensure_db()

def generate_stock_report(stock_code, time_period):
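    """Build a structured analysis report for `stock_code` over `time_period`, a (start_date, end_date) pair."""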

    start_date, end_date = time_period
    today = datetime.now().date()
    # db_path = os.path.join("database", "stock_insights.db")
    db_path = DB_PATH

    report = {
        "stock_code": stock_code,
        "report_period": f"{start_date} to {end_date}"
    }

    with sqlite3.connect(db_path) as conn:
        # Build the relevant_articles temp table: article IDs that mention the stock/company
        # with high confidence and were published within the report period
        conn.execute("DROP TABLE IF EXISTS relevant_articles;")
        conn.execute("""
            CREATE TEMP TABLE relevant_articles AS
            SELECT DISTINCT article_id FROM entities 
            WHERE entity_text = ?
              AND entity_type IN ('STOCK', 'COMPANY')
              AND confidence = 'high'
              AND article_id IN (
                  SELECT article_id FROM articles WHERE publish_date BETWEEN ? AND ?
              );
        """, (stock_code, start_date, end_date))

        # 1. OVERALL SENTIMENT
        q_sentences = """
            SELECT s.sentiment_score, s.sentiment_label, a.publish_date
            FROM sentences s
            JOIN articles a ON s.article_id = a.article_id
            WHERE s.article_id IN (SELECT article_id FROM relevant_articles)
              AND s.sentiment_score IS NOT NULL;
        """
        df_sent = pd.read_sql_query(q_sentences, conn)
        
        if not df_sent.empty:
            df_sent['publish_date'] = pd.to_datetime(df_sent['publish_date']).dt.date
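            # Recency weighting: weight = 1 / (days_ago + 1), so a mention published today
            # counts with weight 1.0 and one from 9 days ago with weight 0.1.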
            df_sent['days_ago'] = (today - df_sent['publish_date']).apply(lambda x: x.days)
            df_sent['weight'] = 1 / (df_sent['days_ago'] + 1)
            weighted_score = (df_sent['sentiment_score'] * df_sent['weight']).sum() / df_sent['weight'].sum()
            # Normalize sentiment labels to lower-case before counting
            df_sent['sentiment_label'] = df_sent['sentiment_label'].str.lower()
            sentiment_counts = df_sent['sentiment_label'].value_counts().to_dict()
            trend = "Tích cực" if weighted_score > 0.1 else "Tiêu cực" if weighted_score < -0.1 else "Trung tính"
        else:
            weighted_score, sentiment_counts, trend = 0.0, {}, "Không có dữ liệu"

        report["overall_sentiment"] = {
            "score": weighted_score,
            "trend": trend,
            "positive_mentions": sentiment_counts.get("positive", 0),
            "negative_mentions": sentiment_counts.get("negative", 0),
            "neutral_mentions": sentiment_counts.get("neutral", 0)
        }

        # 2. KEY EVENTS, RISKS, PRICE ACTIONS
        def get_key_entities(entity_type):
            query = f"""
                SELECT
                    e.entity_text,
                    COUNT(e.entity_id) as count,
                    AVG(s.sentiment_score) as avg_sentiment
                FROM entities e
                JOIN sentences s ON e.sentence_id = s.sentence_id
                WHERE e.article_id IN (SELECT article_id FROM relevant_articles)
                  AND e.entity_type = ?
                GROUP BY e.entity_text
                ORDER BY count DESC
                LIMIT 5;
            """
            df = pd.read_sql_query(query, conn, params=(entity_type,))
            def score_to_label(score):
                # SQL NULL averages arrive as NaN through pandas, so guard both cases
                if score is None or pd.isna(score):
                    return "N/A"
                return "Tích cực" if score > 0.1 else "Tiêu cực" if score < -0.1 else "Trung tính"
            df['sentiment'] = df['avg_sentiment'].apply(score_to_label)
            return df.to_dict('records')

        report["key_events"] = get_key_entities('EVENT')
        report["key_price_actions"] = get_key_entities('PRICE_ACTION')
        report["key_risks_mentioned"] = get_key_entities('RISK')

        # 3. TOP RELATED ENTITIES
        q_related = """
            SELECT e.entity_type, e.entity_text
            FROM entities e
            WHERE e.article_id IN (SELECT article_id FROM relevant_articles)
              AND e.entity_text != ?
              AND e.entity_type IN ('STOCK', 'COMPANY', 'PERSON');
        """
        df_related = pd.read_sql_query(q_related, conn, params=(stock_code,))
        top_related = {}
        if not df_related.empty:
            for etype in ['STOCK', 'COMPANY', 'PERSON']:
                top_related[etype.lower() + 's'] = df_related[df_related['entity_type'] == etype]['entity_text'].value_counts().head(3).index.tolist()
        report["top_related_entities"] = top_related

        # 4. SOURCE ARTICLES
        q_articles = """
            SELECT a.title, a.source_url, s.sentiment_label
            FROM articles a
            JOIN sentences s ON a.article_id = s.article_id
            WHERE a.article_id IN (SELECT article_id FROM relevant_articles)
            GROUP BY a.article_id
            ORDER BY a.publish_date DESC
            LIMIT 5;
        """
        df_articles = pd.read_sql_query(q_articles, conn)
        report["source_articles"] = df_articles.to_dict('records')

    return report
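
# Minimal usage sketch (the stock code "VNM" and the date range below are
# illustrative assumptions, not values referenced elsewhere in this module):
#     report = generate_stock_report("VNM", ("2024-01-01", "2024-03-31"))
#     report["overall_sentiment"]["trend"]   # e.g. "Tích cực" / "Tiêu cực" / "Trung tính"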

# --- REPORT DISPLAY ---
def show_report(report_data, summary, stock_code_input):
    st.markdown(
        f"<h3 style='text-align: center; color: #30cfd0; margin-top:2rem;'>Báo cáo Phân tích cho {report_data.get('stock_code', stock_code_input)}</h3>", unsafe_allow_html=True)
    st.markdown(
        f"<p style='text-align: center; color: #94a3b8;'>Giai đoạn: {report_data.get('report_period', 'N/A')}</p>", unsafe_allow_html=True)

    st.markdown("#### 🤖 Tóm tắt từ AI")
    st.info(summary)

    # Sentiment overview
    st.markdown("#### 📊 Tổng quan Cảm xúc")
    sentiment = report_data['overall_sentiment']
    score = sentiment['score']
    trend_color = "normal"
    if sentiment['trend'] == "Tiêu cực":
        trend_color = "inverse"

    st.metric(
        label="Điểm Cảm xúc (có trọng số thời gian)",
        value=f"{score:.2f}" if score is not None else "N/A",
        delta=sentiment['trend'],
        delta_color=trend_color
    )

    col1, col2, col3 = st.columns(3)
    col1.metric("👍 Tích cực", sentiment['positive_mentions'])
    col2.metric("👎 Tiêu cực", sentiment['negative_mentions'])
    col3.metric("😐 Trung tính", sentiment['neutral_mentions'])

    # Detail tables
    st.markdown("---")

    col_events, col_risks = st.columns(2)
    with col_events:
        st.markdown("#### ⚡ Sự kiện Nổi bật")
        if report_data["key_events"]:
            # Check which columns the records actually contain
            df_events = pd.DataFrame(report_data["key_events"])
            if 'avg_sentiment' in df_events.columns:
                df_events = df_events.rename(
                    columns={'entity_text': 'Sự kiện', 'avg_sentiment': 'Sentiment'})
                show_cols = ['Sự kiện', 'count', 'Sentiment']
            elif 'sentiment' in df_events.columns:
                df_events = df_events.rename(
                    columns={'entity_text': 'Sự kiện'})
                show_cols = ['Sự kiện', 'count', 'sentiment']
            else:
                df_events = df_events.rename(
                    columns={'entity_text': 'Sự kiện'})
                show_cols = ['Sự kiện', 'count']
            st.dataframe(df_events[show_cols], use_container_width=True)
        else:
            st.write("Không có sự kiện nổi bật.")

    with col_risks:
        st.markdown("#### ⚠️ Rủi ro được đề cập")
        if report_data["key_risks_mentioned"]:
            df_risks = pd.DataFrame(report_data["key_risks_mentioned"])
            if 'avg_sentiment' in df_risks.columns:
                df_risks = df_risks.rename(
                    columns={'entity_text': 'Rủi ro', 'avg_sentiment': 'Sentiment'})
                show_cols = ['Rủi ro', 'count', 'Sentiment']
            elif 'sentiment' in df_risks.columns:
                df_risks = df_risks.rename(
                    columns={'entity_text': 'Rủi ro'})
                show_cols = ['Rủi ro', 'count', 'sentiment']
            else:
                df_risks = df_risks.rename(
                    columns={'entity_text': 'Rủi ro'})
                show_cols = ['Rủi ro', 'count']
            st.dataframe(df_risks[show_cols], use_container_width=True)
        else:
            st.write("Không có rủi ro nổi bật.")

    st.markdown("#### 📈 Hành động Giá Chính")
    if report_data["key_price_actions"]:
        df_price = pd.DataFrame(report_data["key_price_actions"])
        if 'avg_sentiment' in df_price.columns:
            df_price = df_price.rename(
                columns={'entity_text': 'Hành động giá', 'avg_sentiment': 'Sentiment'})
            show_cols = ['Hành động giá', 'count', 'Sentiment']
        elif 'sentiment' in df_price.columns:
            df_price = df_price.rename(
                columns={'entity_text': 'Hành động giá'})
            show_cols = ['Hành động giá', 'count', 'sentiment']
        else:
            df_price = df_price.rename(
                columns={'entity_text': 'Hành động giá'})
            show_cols = ['Hành động giá', 'count']
        st.dataframe(df_price[show_cols], use_container_width=True)
    else:
        st.write("Không có hành động giá nổi bật.")

    # Related entities
    st.markdown("---")
    st.markdown("#### 🔗 Các Thực thể Liên quan nhiều nhất")
    related = report_data['top_related_entities']
    if any(related.values()):
        for etype, entities in related.items():
            if entities:
                st.markdown(
                    f"**{etype.replace('_', ' ').title()}:** {', '.join(entities)}")
    else:
        st.write("Không tìm thấy thực thể liên quan nổi bật.")

    # Source articles
    st.markdown("---")
    st.markdown("#### 📰 Nguồn Bài viết Tham khảo")
    if report_data["source_articles"]:
        for article in report_data["source_articles"]:
            st.markdown(
                f"- [{article['title']}]({article['source_url']}) - *Cảm xúc: {article['sentiment_label']}*")
    else:
        st.write("Không có bài viết nào trong khoảng thời gian này.")

    st.markdown("</div>", unsafe_allow_html=True)