"""Group raw conference/venue titles into canonical venues and count them."""
import json
import re
from collections import Counter, defaultdict
from operator import itemgetter

# NOTE: order matters. Names are tried top to bottom and the first match wins,
# so the EACL and NAACL names must come before the shorter ACL name that they
# contain.
CONFERENCE_NAME_TO_ABBR = {
    "Conference on Dependency Linguistics": "DepLing",
    "Conference on Language Modeling": "COLM",
    "European Chapter of the Association for Computational Linguistics": "EACL",
    "North American Chapter of the Association for Computational Linguistics": "NAACL",
    "Empirical Methods in Natural Language Processing": "EMNLP",
    "Association for Computational Linguistics": "ACL",
    "Annual Meeting of the Association for Computational Linguistics": "ACL",
    "International Workshop on Health Text Mining and Information Analysis": "LUOHI",
    "Conference on Computational Semantics": "IWCS",
    "Conference on Machine Translation": "WMT",
    "Conference Recent Advances in Natural Language Processing": "RANLP",
    "Conference on Computational Linguistics": "COLING",
    "Conference of Computational Linguistics": "NODALIDA",
    "Conference on Language Resources and Evaluation": "LREC",
}

# Track descriptions that carry no venue information and are dropped.
UNNEEDED_DESCRIPTIONS = [
    "Shared Task",
    "Short Papers",
    "Poster Papers",
    "Poster",
]

KNOWN_CONFERENCE_NAMES = list(CONFERENCE_NAME_TO_ABBR.values())

# Spelled-out ordinals "first" .. "hundredth", generated instead of hand-typed.
_UNITS = ("first", "second", "third", "fourth", "fifth",
          "sixth", "seventh", "eighth", "ninth")
_TEENS = ("tenth", "eleventh", "twelfth", "thirteenth", "fourteenth",
          "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth")
_TENS = ("twenty", "thirty", "forty", "fifty",
         "sixty", "seventy", "eighty", "ninety")
_ORDINAL_WORDS = (
    _UNITS
    + _TEENS
    + tuple(stem[:-1] + "ieth" for stem in _TENS)
    + tuple(f"{stem}-{unit}" for stem in _TENS for unit in _UNITS)
    + ("hundredth",)
)

# Patterns are compiled once here rather than on every input line.
_YEAR_RE = re.compile(r'\b(19|20)\d{2}\b')
_AFTER_COLON_RE = re.compile(r'\s*:\s*.*$')
_AFTER_DOUBLE_DASH_RE = re.compile(r'\s*--\s*.*$')
_TRAILING_PARENS_RE = re.compile(r'\s*\([^)]*\)\s*$')
_VOLUME_RE = re.compile(r'\s*Volume\s+\d+.*$', re.IGNORECASE)
_PROCEEDINGS_OF_RE = re.compile(r'\s*Proceedings\s+of\s+', re.IGNORECASE)
_NUMERIC_ORDINAL_RE = re.compile(r'\b\d+(?:st|nd|rd|th)\s+', re.IGNORECASE)
_CONFERENCE_WORD_RE = re.compile(r'\bConference\b', re.IGNORECASE)
_LEADING_ORDINAL_RE = re.compile(
    r'^the\s+(?:' + '|'.join(_ORDINAL_WORDS) + r')\s+', re.IGNORECASE
)
# Any upper-case word spelled only with Roman-numeral letters.
# NOTE(review): this is deliberately broad and therefore also deletes
# acronyms such as "LLM" or "CL"; tighten it if that loses information.
_ROMAN_NUMERAL_RE = re.compile(r'\b[IVXLCDM]+\b')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')

# (lower-cased long name, case-insensitive pattern, abbreviation), kept in
# the declaration order of CONFERENCE_NAME_TO_ABBR.
_NAME_RULES = [
    (name.lower(), re.compile(re.escape(name), re.IGNORECASE), abbr)
    for name, abbr in CONFERENCE_NAME_TO_ABBR.items()
]


def _normalize_conference_name(line):
    """Reduce one cleaned proceedings title to its canonical venue name."""
    name = _YEAR_RE.sub('', line)

    # "Findings of X: VENUE" keeps the part after the colon; every other
    # title keeps the part before it.
    if name.startswith("Findings"):
        name = name.split(":")[-1].strip()
    else:
        name = _AFTER_COLON_RE.sub('', name)
    name = _AFTER_DOUBLE_DASH_RE.sub('', name)

    # Applied twice so that two stacked trailing "(...)" groups both go.
    name = _TRAILING_PARENS_RE.sub('', name)
    name = _TRAILING_PARENS_RE.sub('', name)
    name = _VOLUME_RE.sub('', name)
    name = _PROCEEDINGS_OF_RE.sub('', name)
    name = _NUMERIC_ORDINAL_RE.sub('', name)  # "61st ", "3rd ", ...
    name = name.replace("}", "")

    # Drop everything in front of the first word "Conference".
    conference_word = _CONFERENCE_WORD_RE.search(name)
    if conference_word:
        name = name[conference_word.start():]

    name = _LEADING_ORDINAL_RE.sub('', name)  # "the First ", "the Second ", ...
    # Roman numerals must go before punctuation is turned into spaces.
    name = _ROMAN_NUMERAL_RE.sub('', name)
    name = _PUNCTUATION_RE.sub(' ', name)
    name = _DIGITS_RE.sub(' ', name)

    for description in UNNEEDED_DESCRIPTIONS:
        name = name.replace(description, "")

    # Collapse a known long name to its abbreviation. Detection is
    # case-insensitive, so the replacement has to be as well; a
    # case-sensitive replace would silently leave the long name in place.
    lowered = name.lower()
    for long_name_lower, pattern, abbr in _NAME_RULES:
        if long_name_lower in lowered:
            name = pattern.sub(lambda _match, _abbr=abbr: _abbr, name)
            break

    # If the title mentions a known abbreviation, the abbreviation becomes
    # the whole name.
    lowered = name.lower()
    for abbr in KNOWN_CONFERENCE_NAMES:
        if abbr.lower() in lowered:
            name = abbr
            break

    # French-language titles and bare "Conference" leftovers are lumped
    # together.
    if "de la" in name or " le " in name or name == "Conference":
        name = "Others"
    if "Multi lingual" in name:
        name = name.replace("Multi lingual", "Multilingual")

    return _WHITESPACE_RE.sub(' ', name).strip()


def extract_conference_info(input_path='data/conferences.txt',
                            output_path='data/unique_conferences.json',
                            min_ratio=0.001):
    """Extract unique conferences from a title list and save them to JSON.

    Args:
        input_path: UTF-8 text file with one proceedings title per line.
            Blank lines are ignored.
        output_path: Destination of the JSON summary.
        min_ratio: Venues whose share of all non-blank lines is below this
            value are folded into the trailing "Others (N Venues)" entry.

    Returns:
        The dictionary that was written to ``output_path``. It maps each
        venue to ``{"count": int, "conferences": [titles]}``, ordered by
        descending count. The final "Others (N Venues)" entry lists the
        folded venue names instead of titles.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    all_lines = [
        stripped
        for stripped in (raw.strip() for raw in content.split('\n'))
        if stripped
    ]
    total_lines = len(all_lines)

    # One pass for the counts; Counter keeps first-seen order, which makes
    # tie-breaking in the ranking below reproducible between runs.
    line_counts = Counter(all_lines)

    conferences = defaultdict(set)
    abbr2count = defaultdict(int)
    for raw_line, occurrences in line_counts.items():
        # Strip BibTeX-style decoration: spaces, backslashes, braces, commas.
        line = raw_line.strip(' \\{},')
        if not line:
            continue
        base_name = _normalize_conference_name(line)
        conferences[base_name].add(line)
        abbr2count[base_name] += occurrences

    conference_to_save = {}
    others = []
    ranked = sorted(abbr2count.items(), key=itemgetter(1), reverse=True)
    for rank, (conf, count) in enumerate(ranked, start=1):
        ratio = count / total_lines
        if ratio < min_ratio or conf == "Others":
            others.append((conf, count))
            continue
        conference_to_save[conf] = {
            "count": count,
            # Sorted so the JSON does not depend on set iteration order.
            "conferences": sorted(conferences[conf]),
        }
        print(f"{rank}. {conf}: {count} [{ratio * 100:.1f}%]")

    conference_to_save[f"Others ({len(others)} Venues)"] = {
        "count": sum(count for _, count in others),
        "conferences": [conf for conf, _ in others],
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(conference_to_save, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(conference_to_save)} unique conferences")
    print(f"Saved to {output_path}")
    return conference_to_save


if __name__ == "__main__":
    extract_conference_info()