import sqlite3
import pandas as pd
import os
from datetime import datetime
import streamlit as st

# def get_db_path():
#     db_path = "../database/stock_insights.db"
#     if not os.path.exists(db_path) and os.path.exists("/tmp/stock_insights.db"):
#         db_path = "/tmp/stock_insights.db"
#     return db_path

# download from Hugging Face dataset
def ensure_db():
    repo_path = os.path.join(os.getcwd(), "database", "stock_insights.db")
    if os.path.exists(repo_path):
        return repo_path

    candidates = [
        os.path.join("/app", "database", "stock_insights.db"),
        os.path.join("/tmp", "database", "stock_insights.db"),
        os.path.join("/tmp", "stock_insights.db"),
    ]
    for p in candidates:
        if os.path.exists(p):
            return p

    try:
        from huggingface_hub import hf_hub_download
        tmp_dir = os.path.join("/tmp", "database")
        os.makedirs(tmp_dir, exist_ok=True)
        local_file = hf_hub_download(
            repo_id="PuppetLover/stock_insights",
            filename="stock_insights.db",
            repo_type="dataset",
            local_dir=tmp_dir,
            local_dir_use_symlinks=False,
        )
        return local_file
    except Exception as e:
        local_rel = os.path.join("database", "stock_insights.db")
        if os.path.exists(local_rel):
            return local_rel
        raise RuntimeError(f"Cannot access or download database file: {e}")

# Resolve the database path once at import time; DB_PATH is reused throughout this module
DB_PATH = ensure_db()

def generate_stock_report(stock_code, time_period):
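    """Build a structured analysis report for `stock_code` over `time_period`, a (start_date, end_date) pair."""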

    start_date, end_date = time_period
    today = datetime.now().date()
    # db_path = os.path.join("database", "stock_insights.db")
    db_path = DB_PATH

    report = {
        "stock_code": stock_code,
        "report_period": f"{start_date} to {end_date}"
    }

    with sqlite3.connect(db_path) as conn:
        # Build the relevant_articles temp table: article IDs that mention the stock/company
        # with high confidence and were published within the report period
        conn.execute("DROP TABLE IF EXISTS relevant_articles;")
        conn.execute("""
            CREATE TEMP TABLE relevant_articles AS
            SELECT DISTINCT article_id FROM entities 
            WHERE entity_text = ?
              AND entity_type IN ('STOCK', 'COMPANY')
              AND confidence = 'high'
              AND article_id IN (
                  SELECT article_id FROM articles WHERE publish_date BETWEEN ? AND ?
              );
        """, (stock_code, start_date, end_date))

        # 1. OVERALL SENTIMENT
        q_sentences = """
            SELECT s.sentiment_score, s.sentiment_label, a.publish_date
            FROM sentences s
            JOIN articles a ON s.article_id = a.article_id
            WHERE s.article_id IN (SELECT article_id FROM relevant_articles)
              AND s.sentiment_score IS NOT NULL;
        """
        df_sent = pd.read_sql_query(q_sentences, conn)
        
        if not df_sent.empty:
            df_sent['publish_date'] = pd.to_datetime(df_sent['publish_date']).dt.date
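            # Recency weighting: weight = 1 / (days_ago + 1), so a mention published today
            # counts with weight 1.0 and one from 9 days ago with weight 0.1.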
            df_sent['days_ago'] = (today - df_sent['publish_date']).apply(lambda x: x.days)
            df_sent['weight'] = 1 / (df_sent['days_ago'] + 1)
            weighted_score = (df_sent['sentiment_score'] * df_sent['weight']).sum() / df_sent['weight'].sum()
            # Normalize sentiment labels to lower-case before counting
            df_sent['sentiment_label'] = df_sent['sentiment_label'].str.lower()
            sentiment_counts = df_sent['sentiment_label'].value_counts().to_dict()
            trend = "Tích cực" if weighted_score > 0.1 else "Tiêu cực" if weighted_score < -0.1 else "Trung tính"
        else:
            weighted_score, sentiment_counts, trend = 0.0, {}, "Không có dữ liệu"

        report["overall_sentiment"] = {
            "score": weighted_score,
            "trend": trend,
            "positive_mentions": sentiment_counts.get("positive", 0),
            "negative_mentions": sentiment_counts.get("negative", 0),
            "neutral_mentions": sentiment_counts.get("neutral", 0)
        }

        # 2. KEY EVENTS, RISKS, PRICE ACTIONS
        def get_key_entities(entity_type):
            query = f"""
                SELECT
                    e.entity_text,
                    COUNT(e.entity_id) as count,
                    AVG(s.sentiment_score) as avg_sentiment
                FROM entities e
                JOIN sentences s ON e.sentence_id = s.sentence_id
                WHERE e.article_id IN (SELECT article_id FROM relevant_articles)
                  AND e.entity_type = ?
                GROUP BY e.entity_text
                ORDER BY count DESC
                LIMIT 5;
            """
            df = pd.read_sql_query(query, conn, params=(entity_type,))
            def score_to_label(score):
                # SQL NULL averages arrive as NaN through pandas, so guard both cases
                if score is None or pd.isna(score):
                    return "N/A"
                return "Tích cực" if score > 0.1 else "Tiêu cực" if score < -0.1 else "Trung tính"
            df['sentiment'] = df['avg_sentiment'].apply(score_to_label)
            return df.to_dict('records')

        report["key_events"] = get_key_entities('EVENT')
        report["key_price_actions"] = get_key_entities('PRICE_ACTION')
        report["key_risks_mentioned"] = get_key_entities('RISK')

        # 3. TOP RELATED ENTITIES
        q_related = """
            SELECT e.entity_type, e.entity_text
            FROM entities e
            WHERE e.article_id IN (SELECT article_id FROM relevant_articles)
              AND e.entity_text != ?
              AND e.entity_type IN ('STOCK', 'COMPANY', 'PERSON');
        """
        df_related = pd.read_sql_query(q_related, conn, params=(stock_code,))
        top_related = {}
        if not df_related.empty:
            for etype in ['STOCK', 'COMPANY', 'PERSON']:
                top_related[etype.lower() + 's'] = df_related[df_related['entity_type'] == etype]['entity_text'].value_counts().head(3).index.tolist()
        report["top_related_entities"] = top_related

        # 4. SOURCE ARTICLES
        q_articles = """
            SELECT a.title, a.source_url, s.sentiment_label
            FROM articles a
            JOIN sentences s ON a.article_id = s.article_id
            WHERE a.article_id IN (SELECT article_id FROM relevant_articles)
            GROUP BY a.article_id
            ORDER BY a.publish_date DESC
            LIMIT 5;
        """
        df_articles = pd.read_sql_query(q_articles, conn)
        report["source_articles"] = df_articles.to_dict('records')

    return report
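
# Minimal usage sketch (the stock code "VNM" and the date range below are
# illustrative assumptions, not values referenced elsewhere in this module):
#     report = generate_stock_report("VNM", ("2024-01-01", "2024-03-31"))
#     report["overall_sentiment"]["trend"]   # e.g. "Tích cực" / "Tiêu cực" / "Trung tính"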

# --- REPORT DISPLAY ---
def show_report(report_data, summary, stock_code_input):
    st.markdown(
        f"<h3 style='text-align: center; color: #30cfd0; margin-top:2rem;'>Báo cáo Phân tích cho {report_data.get('stock_code', stock_code_input)}</h3>", unsafe_allow_html=True)
    st.markdown(
        f"<p style='text-align: center; color: #94a3b8;'>Giai đoạn: {report_data.get('report_period', 'N/A')}</p>", unsafe_allow_html=True)

    st.markdown("#### 🤖 Tóm tắt từ AI")
    st.info(summary)

    # Sentiment overview
    st.markdown("#### 📊 Tổng quan Cảm xúc")
    sentiment = report_data['overall_sentiment']
    score = sentiment['score']
    trend_color = "normal"
    if sentiment['trend'] == "Tiêu cực":
        trend_color = "inverse"

    st.metric(
        label="Điểm Cảm xúc (có trọng số thời gian)",
        value=f"{score:.2f}" if score is not None else "N/A",
        delta=sentiment['trend'],
        delta_color=trend_color
    )

    col1, col2, col3 = st.columns(3)
    col1.metric("👍 Tích cực", sentiment['positive_mentions'])
    col2.metric("👎 Tiêu cực", sentiment['negative_mentions'])
    col3.metric("😐 Trung tính", sentiment['neutral_mentions'])

    # Detail tables
    st.markdown("---")

    col_events, col_risks = st.columns(2)
    with col_events:
        st.markdown("#### ⚡ Sự kiện Nổi bật")
        if report_data["key_events"]:
            # Check which columns the records actually contain
            df_events = pd.DataFrame(report_data["key_events"])
            if 'avg_sentiment' in df_events.columns:
                df_events = df_events.rename(
                    columns={'entity_text': 'Sự kiện', 'avg_sentiment': 'Sentiment'})
                show_cols = ['Sự kiện', 'count', 'Sentiment']
            elif 'sentiment' in df_events.columns:
                df_events = df_events.rename(
                    columns={'entity_text': 'Sự kiện'})
                show_cols = ['Sự kiện', 'count', 'sentiment']
            else:
                df_events = df_events.rename(
                    columns={'entity_text': 'Sự kiện'})
                show_cols = ['Sự kiện', 'count']
            st.dataframe(df_events[show_cols], use_container_width=True)
        else:
            st.write("Không có sự kiện nổi bật.")

    with col_risks:
        st.markdown("#### ⚠️ Rủi ro được đề cập")
        if report_data["key_risks_mentioned"]:
            df_risks = pd.DataFrame(report_data["key_risks_mentioned"])
            if 'avg_sentiment' in df_risks.columns:
                df_risks = df_risks.rename(
                    columns={'entity_text': 'Rủi ro', 'avg_sentiment': 'Sentiment'})
                show_cols = ['Rủi ro', 'count', 'Sentiment']
            elif 'sentiment' in df_risks.columns:
                df_risks = df_risks.rename(
                    columns={'entity_text': 'Rủi ro'})
                show_cols = ['Rủi ro', 'count', 'sentiment']
            else:
                df_risks = df_risks.rename(
                    columns={'entity_text': 'Rủi ro'})
                show_cols = ['Rủi ro', 'count']
            st.dataframe(df_risks[show_cols], use_container_width=True)
        else:
            st.write("Không có rủi ro nổi bật.")

    st.markdown("#### 📈 Hành động Giá Chính")
    if report_data["key_price_actions"]:
        df_price = pd.DataFrame(report_data["key_price_actions"])
        if 'avg_sentiment' in df_price.columns:
            df_price = df_price.rename(
                columns={'entity_text': 'Hành động giá', 'avg_sentiment': 'Sentiment'})
            show_cols = ['Hành động giá', 'count', 'Sentiment']
        elif 'sentiment' in df_price.columns:
            df_price = df_price.rename(
                columns={'entity_text': 'Hành động giá'})
            show_cols = ['Hành động giá', 'count', 'sentiment']
        else:
            df_price = df_price.rename(
                columns={'entity_text': 'Hành động giá'})
            show_cols = ['Hành động giá', 'count']
        st.dataframe(df_price[show_cols], use_container_width=True)
    else:
        st.write("Không có hành động giá nổi bật.")

    # Related entities
    st.markdown("---")
    st.markdown("#### 🔗 Các Thực thể Liên quan nhiều nhất")
    related = report_data['top_related_entities']
    if any(related.values()):
        for etype, entities in related.items():
            if entities:
                st.markdown(
                    f"**{etype.replace('_', ' ').title()}:** {', '.join(entities)}")
    else:
        st.write("Không tìm thấy thực thể liên quan nổi bật.")

    # Source articles
    st.markdown("---")
    st.markdown("#### 📰 Nguồn Bài viết Tham khảo")
    if report_data["source_articles"]:
        for article in report_data["source_articles"]:
            st.markdown(
                f"- [{article['title']}]({article['source_url']}) - *Cảm xúc: {article['sentiment_label']}*")
    else:
        st.write("Không có bài viết nào trong khoảng thời gian này.")

    st.markdown("</div>", unsafe_allow_html=True)