Spaces:

Shuu12121
/

CodeSearch-ModernBERT-Owl-Demo

Sleeping

App Files Community

Shuu12121 committed on Apr 16

Commit

6396265

verified ·

1 Parent(s): 066cc0b

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -4

app.py CHANGED Viewed

@@ -25,22 +25,40 @@ def code_search_demo(seed: int):
     code_str, doc_str = get_random_query(seed)
     query_emb = model.encode(doc_str, convert_to_tensor=True)
-    # ランダムに10件取得
     candidates = dataset.shuffle(seed=seed).select(range(10))
     candidate_codes = [c["code"] for c in candidates]
     candidate_embeddings = model.encode(candidate_codes, convert_to_tensor=True)
-    # 類似度スコア算出
     cos_scores = util.cos_sim(query_emb, candidate_embeddings)[0]
     results = sorted(zip(candidate_codes, cos_scores), key=lambda x: x[1], reverse=True)
-    # 結果出力
     output = f"### 🔍 Query Docstring\n\n{doc_str}\n\n"
     output += "## 🏆 Top Matches:\n"
     medals = ["🥇", "🥈", "🥉"] + [f"#{i+1}" for i in range(3, len(results))]
     for i, (code, score) in enumerate(results):
         label = medals[i] if i < len(medals) else f"#{i+1}"
-        output += f"\n**{label}** - Similarity: {score.item():.4f}\n\n```python\n{code.strip()[:1000]}\n```\n"
     return output

     code_str, doc_str = get_random_query(seed)
     query_emb = model.encode(doc_str, convert_to_tensor=True)
+    # ランダムに10件取得し、正解 index を含めるようにする（※現実には全件評価がおすすめ）
     candidates = dataset.shuffle(seed=seed).select(range(10))
+    correct_label = dataset[seed]["label"]  # 正解 index（全体に対する）
+    correct_code = dataset[correct_label]["code"]
     candidate_codes = [c["code"] for c in candidates]
     candidate_embeddings = model.encode(candidate_codes, convert_to_tensor=True)
     cos_scores = util.cos_sim(query_emb, candidate_embeddings)[0]
     results = sorted(zip(candidate_codes, cos_scores), key=lambda x: x[1], reverse=True)
+    # 正解コードが Top-K に含まれているかを確認
+    top_k = 10
+    correct_in_top_k = any(code.strip() == correct_code.strip() for code, _ in results[:top_k])
+    mrr = 0.0
+    for rank, (code, _) in enumerate(results, start=1):
+        if code.strip() == correct_code.strip():
+            mrr = 1.0 / rank
+            break
+    # 出力構築
     output = f"### 🔍 Query Docstring\n\n{doc_str}\n\n"
+    output += f"**✅ 正解は Top-{top_k} に含まれているか？**: {'🟢 Yes' if correct_in_top_k else '🔴 No'}\n\n"
+    output += f"**📈 MRR@{top_k}**: {mrr:.4f}\n\n"
     output += "## 🏆 Top Matches:\n"
     medals = ["🥇", "🥈", "🥉"] + [f"#{i+1}" for i in range(3, len(results))]
     for i, (code, score) in enumerate(results):
         label = medals[i] if i < len(medals) else f"#{i+1}"
+        is_correct = "✅" if code.strip() == correct_code.strip() else ""
+        output += f"\n**{label}** - Similarity: {score.item():.4f} {is_correct}\n\n```python\n{code.strip()[:1000]}\n```\n"
+    return output
     return output