|
"""
Utility functions for text processing.

- Text splitting and cleaning
- Keyword extraction
- Centralized Gemini API key management (via api_utils)
"""
|
|
|
import re |
|
import google.generativeai as genai |
|
import os |
|
import logging |
|
import api_utils |
|
|
|
|
|
# Module-level logger that writes INFO-and-above records to a stream handler.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# Attach the handler only once: logging.getLogger returns the same object on
# every call, so re-executing this module (reload, re-run) would otherwise
# stack handlers and emit each record multiple times.
if not logger.handlers:
    logger.addHandler(handler)
|
|
|
|
|
def get_gemini_model():
    """Return the Gemini model provided by ``api_utils``, or ``None``.

    Delegates to ``api_utils.get_gemini_model()``.

    Returns:
        The object returned by ``api_utils.get_gemini_model()`` when it is
        truthy; ``None`` when that call returns a falsy value or raises.
        Failures are logged rather than propagated.
    """
    try:
        model = api_utils.get_gemini_model()
    except Exception as e:
        # Best-effort boundary: callers treat None as "no model available".
        logger.error(f"Gemini 모델 로드 실패: {e}")
        return None

    if not model:
        logger.warning("사용 가능한 Gemini API 키가 없습니다.")
        return None

    logger.info("Gemini 모델 로드 성공 (api_utils 통합 관리)")
    return model
|
|
|
|
|
def clean_and_split(text, only_korean=False):
    """Split *text* into cleaned word tokens.

    Parentheses, square brackets, hyphens and slashes are treated as
    separators, then the text is split on commas and any whitespace.

    Args:
        text: The string to tokenize. ``None`` or an empty string yields
            an empty list.
        only_korean: When true, every character outside the Hangul syllable
            range (가-힣) is removed from each token, and tokens that become
            empty are dropped.

    Returns:
        A list of non-empty tokens in their original order.
    """
    if not text:
        return []

    # Bracket, hyphen and slash characters separate words.
    text = re.sub(r"[()\[\]/-]", " ", text)

    # Split on commas and *all* whitespace in both modes; splitting only on
    # a plain space would glue tab/newline-separated words together once
    # the non-Hangul characters are stripped below.
    words = re.split(r"[,\s]+", text)

    if only_korean:
        words = [re.sub(r"[^가-힣]", "", word) for word in words]

    return [word for word in words if word]
|
|
|
# Maximum number of keyword pairs sent to the model in a single request.
_MAX_PAIRS_PER_REQUEST = 50

# Instruction header sent to the model; the "- a, b" pair list is appended.
_KEYWORD_FILTER_PROMPT = (
    "다음은 소비자가 검색할 가능성이 있는 키워드 쌍 목록입니다.\n"
    "각 쌍은 같은 단어 조합이지만 순서만 다른 경우입니다 (예: 손질오징어 vs 오징어손질).\n\n"
    "아래의 기준에 따라 각 쌍에서 더 자연스러운 키워드를 선택해주세요:\n"
    "1. 소비자가 일상적으로 사용하는 자연스러운 표현을 우선 선택하세요.\n"
    "2. 두 키워드가 모두 자연스럽거나 의미가 약간 다르다면, 반드시 둘 다 유지하세요.\n"
    "3. 확실히 비자연스럽거나 어색한 경우에만 제거하세요.\n"
    "4. 불확실한 경우에는 반드시 키워드를 유지하세요.\n"
    "5. 숫자나 영어가 포함된 키워드는 한글 메인 키워드가 앞쪽에 오는 형태를 선택하세요. (예: '10kg 오징어' 보다 '오징어 10kg' 선택)\n"
    "6. 검색량이 0인 키워드라도 일상적인 표현이라면 가능한 유지하세요. 명백하게 비정상적인 표현만 제거하세요.\n\n"
    "주의: 기본적으로 대부분의 키워드를 유지하고, 매우 명확하게 비자연스러운 것만 제거하세요.\n\n"
    "결과는 다음 형식으로 제공해주세요:\n"
    "- 선택된 키워드 (이유: 자연스러운 표현이기 때문)\n"
    "- 선택된 키워드1, 선택된 키워드2 (이유: 둘 다 자연스럽고 의미가 조금 다름)\n\n"
)

# NOTE(review): this pattern is searched (not full-matched) against the first
# token, so any token containing e.g. "g" or "l" counts as a unit token. That
# is the original behavior and is kept as-is; confirm whether it is intended.
_UNIT_TOKEN_RE = re.compile(r'(?i)(kg|g|mm|cm|ml|l|리터|개|팩|박스|세트|2l|l2)')
_NUMBER_RE = re.compile(r'\d+')
_LITER_PREFIX_RE = re.compile(r'^([0-9]*L) (.+)$', re.IGNORECASE)
# Start of the "(이유: ...)" explanation the model appends to each bullet.
_REASON_MARKER_RE = re.compile(r'\(\s*이유\s*:')


def _collect_keywords(pairs):
    """Return the set of every keyword that appears in any pair."""
    return {keyword for pair in pairs for keyword in pair}


def _parse_selected_keywords(response_text):
    """Extract keywords from the '- kw1, kw2 (이유: ...)' bullet lines of a reply."""
    selected = []
    for raw_line in response_text.strip().split("\n"):
        # Strip first so indented bullets (and stray '\r') are still recognized.
        line = raw_line.strip()
        if not line.startswith("-"):
            continue
        keywords_part = _REASON_MARKER_RE.split(line.strip("- "), maxsplit=1)[0]
        for keyword in keywords_part.split(","):
            keyword = keyword.strip()
            if keyword:
                selected.append(keyword)
    return selected


def _move_quantity_token_last(keyword):
    """Move a leading number/unit token (e.g. '10kg', '2L') to the end of *keyword*."""
    parts = keyword.split()
    # len(parts) > 1 also protects against whitespace-only keywords, for which
    # split() returns an empty list.
    if " " in keyword and len(parts) > 1:
        first_part = parts[0]
        if _UNIT_TOKEN_RE.search(first_part) or _NUMBER_RE.search(first_part):
            corrected_kw = " ".join(parts[1:] + [first_part])
            logger.info(f"키워드 순서 강제 수정: '{keyword}' -> '{corrected_kw}'")
            keyword = corrected_kw

    # Second pass: a keyword that (still) starts with an "<digits>L" token.
    match = _LITER_PREFIX_RE.match(keyword)
    if match:
        fixed_kw = f"{match.group(2)} {match.group(1)}"
        logger.info(f"특정 패턴 수정: '{keyword}' -> '{fixed_kw}'")
        keyword = fixed_kw

    return keyword


def filter_keywords_with_gemini(pairs, gemini_model=None, max_pairs=_MAX_PAIRS_PER_REQUEST):
    """Ask a Gemini model which keyword of each reordered pair reads naturally.

    Args:
        pairs: Iterable of two-item keyword pairs (each pair is unpacked as
            ``a, b`` when the prompt is built).
        gemini_model: Object exposing ``generate_content(prompt)`` whose result
            has a ``.text`` attribute. When ``None``, the model is obtained
            from ``get_gemini_model()``.
        max_pairs: Maximum number of pairs included in the prompt. For every
            pair beyond this limit only its first keyword is kept, without
            consulting the model.

    Returns:
        A list of keywords. On the normal path these are the keywords parsed
        from the model reply (plus the first keyword of each overflow pair),
        with leading number/unit tokens moved to the end. If no model is
        available, the reply yields no keywords, or any error occurs, every
        keyword from ``pairs`` is kept instead. An empty ``pairs`` returns
        ``[]`` without calling the model.
    """
    # Materialize once so sets/generators can be measured and sliced safely.
    pairs = list(pairs)
    if not pairs:
        # Nothing to filter; calling the model with an empty list could make
        # it echo the example bullets from the prompt as "keywords".
        return []

    all_keywords = _collect_keywords(pairs)

    if gemini_model is None:
        gemini_model = get_gemini_model()

    if gemini_model is None:
        logger.error("Gemini 모델을 가져올 수 없습니다. 모든 키워드를 유지합니다.")
        return list(all_keywords)

    pairs_to_process = pairs[:max_pairs]
    overflow_pairs = pairs[max_pairs:]

    logger.info(f"필터링할 키워드 쌍: 총 {len(pairs)}개 중 {len(pairs_to_process)}개 처리")

    formatted = "\n".join(f"- {a}, {b}" for a, b in pairs_to_process)
    full_prompt = _KEYWORD_FILTER_PROMPT + formatted

    # Deliberately broad best-effort boundary: any failure keeps all keywords.
    try:
        logger.info(f"Gemini API 호출 시작 - {len(pairs_to_process)}개 키워드 쌍 처리 중...")
        response = gemini_model.generate_content(full_prompt)
        logger.info("Gemini API 응답 성공")

        final_keywords = _parse_selected_keywords(response.text)

        if overflow_pairs:
            logger.info(f"추가 키워드 처리: 남은 {len(overflow_pairs)}개 쌍의 첫 번째 키워드 추가")
            final_keywords.extend(pair[0] for pair in overflow_pairs)

        if not final_keywords:
            logger.warning("경고: 선택된 키워드가 없어 모든 키워드를 유지합니다.")
            final_keywords = list(all_keywords)

        specific_fixes = [_move_quantity_token_last(kw) for kw in final_keywords]

        selected_set = set(specific_fixes)
        removed_keywords = all_keywords - selected_set

        logger.info("\n=== LLM에 의해 제거된 키워드 목록 ===")
        for kw in removed_keywords:
            logger.info(f"  - {kw}")
        logger.info(f"총 {len(all_keywords)}개 중 {len(removed_keywords)}개 제거됨 ({len(selected_set)}개 유지)\n")

        return specific_fixes

    except Exception as e:
        # logger.exception records the traceback through the logging handlers.
        logger.exception(f"Gemini 오류: {e}")
        logger.error("오류 발생으로 인해 모든 키워드를 유지합니다.")
        logger.error(f"오류 유형: {type(e).__name__}")
        logger.info(f"안전 모드: {len(all_keywords)}개 키워드 모두 유지")
        return list(all_keywords)
|
|
|
def get_search_volume_range(total_volume):
    """Return the search-volume bracket label for *total_volume*.

    The labels read "<N>미만" (strictly less than N) and "10000이상"
    (10000 or more), so each bracket uses an exclusive upper bound: a value
    equal to a threshold belongs to the next bracket up.

    Args:
        total_volume: Total search volume as a number.

    Returns:
        One of "100미만", "1000미만", "2000미만", "5000미만", "10000미만"
        or "10000이상".
    """
    # (exclusive upper bound, label), in ascending order.
    brackets = (
        (100, "100미만"),
        (1000, "1000미만"),
        (2000, "2000미만"),
        (5000, "5000미만"),
        (10000, "10000미만"),
    )
    for upper_bound, label in brackets:
        if total_volume < upper_bound:
            return label
    return "10000이상"