# NOTE(review): extraction artifact removed here (file size, commit hash, and
# a flattened line-number index were pasted above the code; none of it is Python).
import re
import json
from collections import defaultdict
# KNOWN_CONFERENCE_NAMES = ["COLING", "COLM", "EACL", "NAACL", "EMNLP", "AACL", "ACL"] # NOTE: NAACL and EACL need to come earlier than ACL
CONFERENCE_NAME_TO_ABBR = {
"Conference on Dependency Linguistics": "DepLing",
"Conference on Language Modeling": "COLM",
"European Chapter of the Association for Computational Linguistics": "EACL",
"North American Chapter of the Association for Computational Linguistics": "NAACL",
"Empirical Methods in Natural Language Processing": "EMNLP",
"Association for Computational Linguistics": "ACL",
"Annual Meeting of the Association for Computational Linguistics": "ACL",
"International Workshop on Health Text Mining and Information Analysis": "LUOHI",
"Conference on Computational Semantics": "IWCS",
"Conference on Machine Translation": "WMT",
"Conference Recent Advances in Natural Language Processing": "RANLP",
"Conference on Computational Linguistics": "COLING",
"Conference of Computational Linguistics": "NODALIDA",
"Conference on Language Resources and Evaluation": "LREC",
}
UNNEEDED_DESCRIPTIONS = [
"Shared Task",
"Short Papers",
"Poster Papers",
"Poster",
]
KNOWN_CONFERENCE_NAMES = list(CONFERENCE_NAME_TO_ABBR.values())
def extract_conference_info():
"""Extract unique conferences from conferences.txt and save to JSON"""
# Read the conferences file
with open('data/conferences.txt', 'r', encoding='utf-8') as f:
content = f.read()
# Split by lines and clean up
all_lines = [line.strip() for line in content.split('\n') if line.strip()]
lines = list(set(all_lines))
# Dictionary to store unique conferences with their years
conferences = defaultdict(set)
abbr2count = defaultdict(int)
for line in lines:
# Remove leading/trailing braces and clean up
_count = all_lines.count(line)
line = line.strip(' \{\},')
# line = line.replace("{", "")
# line = line.replace("}", "")
# line = line.strip()
# Skip empty lines
if not line:
continue
# Extract year from the conference name
year_match = re.search(r'\b(19|20)\d{2}\b', line)
year = year_match.group() if year_match else None
# Extract the base conference name (remove year and common suffixes)
# Remove year from the name for grouping
base_name = re.sub(r'\b(19|20)\d{2}\b', '', line)
# Remove common suffixes that don't affect the core conference name
if base_name.startswith("Findings"):
base_name = base_name.split(":")[-1].strip()
else:
base_name = re.sub(r'\s*:\s*.*$', '', base_name) # Remove everything after colon
base_name = re.sub(r'\s*--\s*.*$', '', base_name) # Remove everything after double dash
# remove everything within parentheses
base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name) # Remove trailing parentheses
base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name) # Remove trailing parentheses
base_name = re.sub(r'\s*Volume\s+\d+.*$', '', base_name, flags=re.IGNORECASE) # Remove volume info
base_name = re.sub(r'\s*Proceedings\s+of\s+', '', base_name, flags=re.IGNORECASE) # Remove "Proceedings of"
# Remove ordinal numbers (1st, 2nd, 3rd, 4th, 5th, 6th, 7th, 8th, 9th, 10th, 11th, 12th, etc.)
base_name = re.sub(r'\b\d+(?:st|nd|rd|th)\s+', '', base_name, flags=re.IGNORECASE)
base_name = base_name.replace("}", "")
# Remove any words before the first occurrence of "Conference"
conference_match = re.search(r'\bConference\b', base_name, re.IGNORECASE)
if conference_match:
start_pos = conference_match.start()
base_name = base_name[start_pos:]
# Remove "the First", "the Second", etc. from the beginning
base_name = re.sub(r'^the\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|Fifteenth|Sixteenth|Seventeenth|Eighteenth|Nineteenth|Twentieth|Twenty-first|Twenty-second|Twenty-third|Twenty-fourth|Twenty-fifth|Twenty-sixth|Twenty-seventh|Twenty-eighth|Twenty-ninth|Thirtieth|Thirty-first|Thirty-second|Thirty-third|Thirty-fourth|Thirty-fifth|Thirty-sixth|Thirty-seventh|Thirty-eighth|Thirty-ninth|Fortieth|Forty-first|Forty-second|Forty-third|Forty-fourth|Forty-fifth|Forty-sixth|Forty-seventh|Forty-eighth|Forty-ninth|Fiftieth|Fifty-first|Fifty-second|Fifty-third|Fifty-fourth|Fifty-fifth|Fifty-sixth|Fifty-seventh|Fifty-eighth|Fifty-ninth|Sixtieth|Sixty-first|Sixty-second|Sixty-third|Sixty-fourth|Sixty-fifth|Sixty-sixth|Sixty-seventh|Sixty-eighth|Sixty-ninth|Seventieth|Seventy-first|Seventy-second|Seventy-third|Seventy-fourth|Seventy-fifth|Seventy-sixth|Seventy-seventh|Seventy-eighth|Seventy-ninth|Eightieth|Eighty-first|Eighty-second|Eighty-third|Eighty-fourth|Eighty-fifth|Eighty-sixth|Eighty-seventh|Eighty-eighth|Eighty-ninth|Ninetieth|Ninety-first|Ninety-second|Ninety-third|Ninety-fourth|Ninety-fifth|Ninety-sixth|Ninety-seventh|Ninety-eighth|Ninety-ninth|Hundredth)\s+', '', base_name, flags=re.IGNORECASE)
# Remove Roman numerals (I, II, III, IV, V, VI, VII, VIII, IX, X, XI, XII, XIII, XIV, XV, XVI, XVII, XVIII, XIX, XX, etc.)
# This needs to happen BEFORE punctuation removal to catch Roman numerals properly
# More comprehensive pattern to catch all Roman numerals
base_name = re.sub(r'\b(?:I{1,3}|IV|VI{0,3}|IX|X{1,3}|XI{0,3}|XV|XX{0,3}|XXX{0,3}|XL|L|LX{0,3}|LXX{0,3}|LXXX|XC|C|CC{0,3}|CD|D|DC{0,3}|DCC{0,3}|DCCC|CM|M{0,3})\b', '', base_name)
# Also try a simpler approach - remove any sequence of I, V, X, L, C, D, M that looks like a Roman numeral
base_name = re.sub(r'\b[IVXLCDM]+\b', '', base_name)
# Replace punctuation with whitespace
base_name = re.sub(r'[^\w\s]', ' ', base_name)
# Replace all numbers with whitespace
base_name = re.sub(r'\d+', ' ', base_name)
# base_name = base_name.replace("Shared Task ", "")
for unneeded_description in UNNEEDED_DESCRIPTIONS:
base_name = base_name.replace(unneeded_description, "")
for conf_name, conf_abbr in CONFERENCE_NAME_TO_ABBR.items():
if conf_name.lower() in base_name.lower():
base_name = base_name.replace(conf_name, conf_abbr)
break
for conf in KNOWN_CONFERENCE_NAMES:
if conf.lower() in base_name.lower():
base_name = conf
break
if "de la" in base_name or " le " in base_name or base_name == "Conference":
base_name = "Others"
if "Multi lingual" in base_name:
base_name = base_name.replace("Multi lingual", "Multilingual")
# Clean up extra whitespace and consecutive whitespace
base_name = re.sub(r'\s+', ' ', base_name).strip()
# Skip if base name is too short
# if len(base_name) < 5:
# base_name = "Unknown"
# Add to conferences dictionary
# if year:
# conferences[base_name].add(int(year))
# else:
# conferences[base_name].add(None)
conferences[base_name].add(line)
abbr2count[base_name] += _count
# conf_abbr2keywords = {
# "ACL": ["Association for Computational Linguistics"],
# "EMNLP": ["Empirical Methods in Natural Language Processing"],
# "NAACL": ["North American Chapter of the Association for Computational Linguistics"],
# "EACL": ["European Chapter of the Association for Computational Linguistics"],
# "COLM": ["Conference on Computational Linguistics"],
# }
# print(f"Found {len(conferences)} unique conferences from {len(lines)} lines")
# for i, conf in enumerate(sorted(conferences.keys())):
# print(f"{i+1}. {conf}")
# if i > 200: break
# import pdb; pdb.set_trace()
conference_to_save = {}
others = []
for i, (conf, count) in enumerate(sorted(abbr2count.items(), key=lambda x: x[1], reverse=True)):
ratio = count / len(all_lines)
if ratio < 0.001 or conf == "Others":
others.append((conf, count))
continue
conference_to_save[conf] = {
"count": count,
"conferences": tuple(conferences[conf]),
}
print(f"{i+1}. {conf}: {count} [{ratio * 100:.1f}%]")
conference_to_save[f"Others ({len(others)} Venues)"] = {
"count": sum(count for conf, count in others),
"conferences": tuple(conf for conf, count in others),
}
# Save to JSON file
with open('data/unique_conferences.json', 'w', encoding='utf-8') as f:
json.dump(conference_to_save, f, indent=2, ensure_ascii=False)
print(f"Extracted {len(conference_to_save)} unique conferences")
print(f"Saved to data/unique_conferences.json")
if __name__ == "__main__":
    # Script entry point: run the extraction end to end when executed directly.
    extract_conference_info()