import sqlite3
import json
import random
import os
import re

from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the GPT model
model_name = "EleutherAI/pythia-410m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Data directory and database path
DATA_DIR = "./data"
DB_PATH = os.path.join(DATA_DIR, "sentences.db")

# Create the sentences table if it does not already exist
def init_db():
    os.makedirs(DATA_DIR, exist_ok=True)  # make sure ./data exists before connecting
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS sentences (
            word TEXT PRIMARY KEY,
            phonetic TEXT,
            sentence TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    conn.commit()
    conn.close()

# Scan the data directory to build the source menu automatically
def get_sources():
    files = os.listdir(DATA_DIR)
    sources = [os.path.splitext(f)[0] for f in files if f.endswith(".json")]
    return sources

# Look up a word in the sentence cache
def get_sentence(word):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('SELECT word, phonetic, sentence FROM sentences WHERE word=?', (word,))
    result = c.fetchone()
    conn.close()
    return result

# Save a sentence to SQLite, updating the row if the word already exists
def save_sentence(word, phonetic, sentence):
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''
        INSERT INTO sentences (word, phonetic, sentence)
        VALUES (?, ?, ?)
        ON CONFLICT(word) DO UPDATE SET
            sentence=excluded.sentence,
            phonetic=excluded.phonetic
    ''', (word, phonetic, sentence))
    conn.commit()
    conn.close()

# Strip noise from GPT-generated sentences
def clean_sentence(output):
    output = output.split(":")[-1].strip()  # drop the echoed prompt up to the last colon
    output = re.sub(r"^\d+\.\s*", "", output).strip()  # drop leading list numbering like "1. "
    output = re.sub(r"Write.*?beginners\.", "", output, flags=re.IGNORECASE).strip()
    output = re.sub(r"\*\*?\d+\.*\*\*", "", output).strip()  # drop bold numbering like "**1.**"
    if not output.endswith("."):
        output += "."
    return output

# Core flow: sample words, then read from the cache or generate with GPT
def get_words_with_sentences(source, n):
    status = []
    display_result = ""
    try:
        # Load the word list
        data_path = os.path.join(DATA_DIR, f"{source}.json")
        with open(data_path, 'r', encoding='utf-8') as f:
            words = json.load(f)

        # Randomly sample n words (capped at the list size so sample() cannot raise)
        selected_words = random.sample(words, min(n, len(words)))

        for i, word_data in enumerate(selected_words):
            word = word_data['word']
            phonetic = word_data['phonetic']

            # Check the cache for an existing example sentence
            cached_result = get_sentence(word)
            if cached_result:
                sentence = cached_result[2]
                status.append(f"✅ {word} already has a sentence; reading it from the cache")
            else:
                # Not cached, so generate a sentence with GPT
                status.append(f"📝 Generating sentence {i + 1}/{n} for [{word}]...")
                prompt = f"A simple English sentence with the word '{word}':"
                inputs = tokenizer(prompt, return_tensors="pt")
                outputs = model.generate(**inputs, max_new_tokens=30)
                sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Clean up the generated sentence
                sentence = clean_sentence(sentence)

                # Store it in the cache
                save_sentence(word, phonetic, sentence)

            # Format the result for display
            display_result += f"""

📖 Word: {word}

🔤 Phonetic: {phonetic}

✍️ Example: {sentence}

"""
        status.append("✅ Done!")
        return display_result, "\n".join(status)
    except Exception as e:
        status.append(f"❌ Error: {str(e)}")
        return f"""

Error: {str(e)}

""", "\n".join(status)

# Create the sentences table automatically at startup
init_db()