Shuu12121 committed on
Commit
d01b7c9
·
verified ·
1 Parent(s): 6be745c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -4
app.py CHANGED
@@ -5,6 +5,9 @@ import random
5
  from sentence_transformers import SentenceTransformer, util
6
  from datasets import load_dataset
7
  from spaces import GPU
 
 
 
8
 
9
  # --- Load model ---
10
  model = SentenceTransformer("Shuu12121/CodeSearch-ModernBERT-Owl")
@@ -13,23 +16,34 @@ model.eval()
13
  # --- Load CodeSearchNet dataset (test split only) ---
14
  dataset = load_dataset("code_x_glue_tc_nl_code_search_adv", trust_remote_code=True, split="test")
15
 
 
 
 
 
 
 
 
 
 
 
 
16
  # --- Query & Candidate Generator ---
17
  def get_query_and_candidates(seed: int = 42):
18
  random.seed(seed)
19
  idx = random.randint(0, len(dataset) - 1)
20
  query = dataset[idx]
21
- correct_code = query["code"]
22
  doc_str = query["docstring"]
23
 
24
- # 正例 + ランダム負例(正例を除く)
25
  candidate_pool = [example for i, example in enumerate(dataset) if i != idx]
26
- negatives = random.sample(candidate_pool, k=99) # 9件の負例
27
- candidates = [correct_code] + [neg["code"] for neg in negatives]
28
  random.shuffle(candidates)
29
 
30
  return doc_str, correct_code, candidates
31
 
32
 
 
33
  @GPU
34
  def code_search_demo(seed: int):
35
  doc_str, correct_code, candidates = get_query_and_candidates(seed)
 
5
  from sentence_transformers import SentenceTransformer, util
6
  from datasets import load_dataset
7
  from spaces import GPU
8
+ import re
9
+
10
+
11
 
12
  # --- Load model ---
13
  model = SentenceTransformer("Shuu12121/CodeSearch-ModernBERT-Owl")
 
16
  # --- Load CodeSearchNet dataset (test split only) ---
17
  dataset = load_dataset("code_x_glue_tc_nl_code_search_adv", trust_remote_code=True, split="test")
18
 
19
def remove_comments_from_code(code: str) -> str:
    """Strip ``#`` comments and triple-quoted strings from Python source text.

    The text is scanned once, left to right, recognising triple-quoted
    strings, ordinary one-line string literals and comments as whole tokens.
    Because a token is consumed as a unit, a ``#`` inside a string literal is
    not mistaken for a comment, and quote characters inside a comment do not
    start a string.

    Every triple-quoted string is removed (docstring or not), together with
    any string prefix such as ``r`` or ``f`` attached to it. Ordinary string
    literals are kept unchanged. Whitespace around removed text is left as is.

    Args:
        code: Source text to clean.

    Returns:
        The text with comments and triple-quoted strings deleted.
    """

    def _keep_or_drop(match):
        # Only ordinary string literals survive; comments and triple-quoted
        # strings are replaced by nothing.
        return match.group(0) if match.lastgroup == "string" else ""

    return _COMMENT_OR_STRING_RE.sub(_keep_or_drop, code)


# One alternation so that whichever token starts first in the text wins.
_COMMENT_OR_STRING_RE = re.compile(
    # Triple-quoted string, with an optional one- or two-letter prefix.
    r'(?P<triple>(?:\b[rRbBuUfF]{1,2})?(?:"""[\s\S]*?"""|'
    r"'''[\s\S]*?'''))"
    # One-line string literal; a backslash escapes the following character.
    r'|(?P<string>"(?:\\[\s\S]|[^"\\\n])*"'
    r"|'(?:\\[\s\S]|[^'\\\n])*')"
    # Comment running to the end of the line.
    r"|(?P<comment>#[^\n]*)"
)
28
+
29
+
30
  # --- Query & Candidate Generator ---
31
  def get_query_and_candidates(seed: int = 42):
32
  random.seed(seed)
33
  idx = random.randint(0, len(dataset) - 1)
34
  query = dataset[idx]
35
+ correct_code = remove_comments_from_code(query["code"]) # 修正
36
  doc_str = query["docstring"]
37
 
 
38
  candidate_pool = [example for i, example in enumerate(dataset) if i != idx]
39
+ negatives = random.sample(candidate_pool, k=99)
40
+ candidates = [correct_code] + [remove_comments_from_code(neg["code"]) for neg in negatives] # 修正
41
  random.shuffle(candidates)
42
 
43
  return doc_str, correct_code, candidates
44
 
45
 
46
+
47
  @GPU
48
  def code_search_demo(seed: int):
49
  doc_str, correct_code, candidates = get_query_and_candidates(seed)