Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,9 @@ import random
|
|
5 |
from sentence_transformers import SentenceTransformer, util
|
6 |
from datasets import load_dataset
|
7 |
from spaces import GPU
|
|
|
|
|
|
|
8 |
|
9 |
# --- Load model ---
|
10 |
# Construct the SentenceTransformer from the named checkpoint and keep it in
# the module-level `model` variable.
model = SentenceTransformer("Shuu12121/CodeSearch-ModernBERT-Owl")
|
@@ -13,23 +16,34 @@ model.eval()
|
|
13 |
# --- Load CodeSearchNet dataset (test split only) ---
|
14 |
# Only the "test" split is loaded.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run; confirm it is still required and acceptable.
dataset = load_dataset("code_x_glue_tc_nl_code_search_adv", trust_remote_code=True, split="test")
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
# --- Query & Candidate Generator ---
|
17 |
def get_query_and_candidates(seed: int = 42):
    """Pick one dataset example as the query and build 100 shuffled candidates.

    The global ``random`` generator is seeded with ``seed``, so the same seed
    always selects the same query and the same negatives.

    Args:
        seed: Value passed to ``random.seed`` before any draw is made.

    Returns:
        A ``(doc_str, correct_code, candidates)`` tuple: the query's
        ``"docstring"`` field, its ``"code"`` field, and a shuffled list
        holding that code plus the ``"code"`` of 99 other examples.

    Raises:
        ValueError: If the dataset is empty or has fewer than 100 examples
            (raised by ``random.randint`` / ``random.sample``).
    """
    random.seed(seed)
    n_examples = len(dataset)
    idx = random.randint(0, n_examples - 1)
    query = dataset[idx]
    correct_code = query["code"]
    doc_str = query["docstring"]

    # Positive example + random negatives (the positive itself is excluded).
    # Sample positions in the "dataset without the query" pool instead of
    # copying every example into a list: random.sample only looks at the
    # population length, so the picks match sampling from the full list.
    pool_positions = random.sample(range(n_examples - 1), k=99)
    # Positions at or after the query are shifted by one to skip over it.
    negative_indices = [pos if pos < idx else pos + 1 for pos in pool_positions]

    candidates = [correct_code]
    candidates.extend(dataset[i]["code"] for i in negative_indices)
    random.shuffle(candidates)

    return doc_str, correct_code, candidates
|
31 |
|
32 |
|
|
|
33 |
@GPU
|
34 |
def code_search_demo(seed: int):
|
35 |
doc_str, correct_code, candidates = get_query_and_candidates(seed)
|
|
|
5 |
from sentence_transformers import SentenceTransformer, util
|
6 |
from datasets import load_dataset
|
7 |
from spaces import GPU
|
8 |
+
import re
|
9 |
+
|
10 |
+
|
11 |
|
12 |
# --- Load model ---
|
13 |
# Construct the SentenceTransformer from the named checkpoint and keep it in
# the module-level `model` variable.
model = SentenceTransformer("Shuu12121/CodeSearch-ModernBERT-Owl")
|
|
|
16 |
# --- Load CodeSearchNet dataset (test split only) ---
|
17 |
# Only the "test" split is loaded.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run; confirm it is still required and acceptable.
dataset = load_dataset("code_x_glue_tc_nl_code_search_adv", trust_remote_code=True, split="test")
|
18 |
|
19 |
+
# One left-to-right scan over triple-quoted strings, ordinary single-line
# string literals and `#` comments. Matching them in a single alternation
# means a `#` inside a string is never mistaken for a comment, and a quote
# inside a comment is never mistaken for the start of a string.
_STRING_OR_COMMENT_RE = re.compile(
    r'(?P<triple>"""[\s\S]*?"""|' r"'''[\s\S]*?''')"
    r'|(?P<single>"(?:\\[\s\S]|[^"\\\n])*"|' r"'(?:\\[\s\S]|[^'\\\n])*')"
    r"|(?P<comment>#[^\n]*)"
)


def remove_comments_from_code(code: str) -> str:
    """Strip comments and triple-quoted strings from Python source text.

    Removes:
      * every triple-quoted string (``\"\"\"...\"\"\"`` or ``'''...'''``),
        which covers docstrings and block "comments";
      * every ``#`` comment, from the ``#`` up to (not including) the newline.

    Ordinary single-line string literals are left untouched, so a ``#``
    inside them (``"#fff"``, URL fragments, ...) is preserved. Surrounding
    whitespace and newlines are not altered.

    Args:
        code: Source text to clean.

    Returns:
        ``code`` with the spans described above deleted.
    """

    def _keep_or_drop(match):
        # Only ordinary string literals survive; triple-quoted strings and
        # comments are replaced by nothing.
        return match.group(0) if match.lastgroup == "single" else ""

    return _STRING_OR_COMMENT_RE.sub(_keep_or_drop, code)
|
28 |
+
|
29 |
+
|
30 |
# --- Query & Candidate Generator ---
|
31 |
def get_query_and_candidates(seed: int = 42):
    """Pick one dataset example as the query and build 100 shuffled candidates.

    The global ``random`` generator is seeded with ``seed``, so the same seed
    always selects the same query and the same negatives. Every code snippet
    is passed through ``remove_comments_from_code`` before being returned.

    Args:
        seed: Value passed to ``random.seed`` before any draw is made.

    Returns:
        A ``(doc_str, correct_code, candidates)`` tuple: the query's
        ``"docstring"`` field, its comment-stripped ``"code"`` field, and a
        shuffled list holding that code plus the comment-stripped ``"code"``
        of 99 other examples.

    Raises:
        ValueError: If the dataset is empty or has fewer than 100 examples
            (raised by ``random.randint`` / ``random.sample``).
    """
    random.seed(seed)
    n_examples = len(dataset)
    idx = random.randint(0, n_examples - 1)
    query = dataset[idx]
    correct_code = remove_comments_from_code(query["code"])
    doc_str = query["docstring"]

    # Sample positions in the "dataset without the query" pool instead of
    # copying every example into a list: random.sample only looks at the
    # population length, so the picks match sampling from the full list.
    pool_positions = random.sample(range(n_examples - 1), k=99)
    # Positions at or after the query are shifted by one to skip over it.
    negative_indices = [pos if pos < idx else pos + 1 for pos in pool_positions]

    candidates = [correct_code]
    candidates.extend(
        remove_comments_from_code(dataset[i]["code"]) for i in negative_indices
    )
    random.shuffle(candidates)

    return doc_str, correct_code, candidates
|
44 |
|
45 |
|
46 |
+
|
47 |
@GPU
|
48 |
def code_search_demo(seed: int):
|
49 |
doc_str, correct_code, candidates = get_query_and_candidates(seed)
|