import re
import json
from collections import Counter, defaultdict


# NOTE: Matching below iterates over this dict in insertion order, so the more
# specific names (e.g. the NAACL/EACL chapter names) must come before the bare
# "Association for Computational Linguistics" entry.
CONFERENCE_NAME_TO_ABBR = {
    "Conference on Dependency Linguistics": "DepLing",
    "Conference on Language Modeling": "COLM",
    "European Chapter of the Association for Computational Linguistics": "EACL",
    "North American Chapter of the Association for Computational Linguistics": "NAACL",
    "Empirical Methods in Natural Language Processing": "EMNLP",
    "Association for Computational Linguistics": "ACL",
    "Annual Meeting of the Association for Computational Linguistics": "ACL",
    "International Workshop on Health Text Mining and Information Analysis": "LUOHI",
    "Conference on Computational Semantics": "IWCS",
    "Conference on Machine Translation": "WMT",
    "Conference Recent Advances in Natural Language Processing": "RANLP",
    "Conference on Computational Linguistics": "COLING",
    "Conference of Computational Linguistics": "NODALIDA",
    "Conference on Language Resources and Evaluation": "LREC",
}
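
# Illustrative sketch of how these mappings are applied further below. The
# input string is hypothetical, not taken from data/conferences.txt:
#   "Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)"
#   -> year, trailing parentheses, "Proceedings of", and the ordinal stripped
#   -> "Conference on Language Resources and Evaluation"
#   -> mapped via CONFERENCE_NAME_TO_ABBR to "LREC"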

UNNEEDED_DESCRIPTIONS = [
    "Shared Task",
    "Short Papers",
    "Poster Papers",
    "Poster",
]

# Abbreviations to normalize against; dict.fromkeys preserves order and drops the duplicate "ACL"
KNOWN_CONFERENCE_NAMES = list(dict.fromkeys(CONFERENCE_NAME_TO_ABBR.values()))

def extract_conference_info():
    """Read data/conferences.txt, normalize the venue names, and save the
    unique conferences with their counts to data/unique_conferences.json."""
    
    # Read the conferences file
    with open('data/conferences.txt', 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Split into non-empty lines; keep the per-line frequencies and iterate
    # over the unique lines only
    all_lines = [line.strip() for line in content.split('\n') if line.strip()]
    line_counts = Counter(all_lines)
    lines = list(line_counts)

    # Dictionary to store unique conferences with their years
    conferences = defaultdict(set)
    abbr2count = defaultdict(int)

    for line in lines:
        # Number of times this exact line appears in the raw file
        _count = line_counts[line]

        # Remove leading/trailing whitespace, braces, and commas
        line = line.strip(' {},')

        # Skip empty lines
        if not line:
            continue
            
        # Extract the year (informational only; grouping below uses the name alone)
        year_match = re.search(r'\b(19|20)\d{2}\b', line)
        year = year_match.group() if year_match else None
        
        # Extract the base conference name (remove year and common suffixes)
        # Remove year from the name for grouping
        base_name = re.sub(r'\b(19|20)\d{2}\b', '', line)
        
        # Remove common suffixes that don't affect the core conference name
        if base_name.startswith("Findings"):
            base_name = base_name.split(":")[-1].strip()
        else:
            base_name = re.sub(r'\s*:\s*.*$', '', base_name)  # Remove everything after colon
            base_name = re.sub(r'\s*--\s*.*$', '', base_name)  # Remove everything after double dash
        # Remove trailing parenthesized groups (applied twice so up to two
        # trailing groups are stripped)
        base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name)
        base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name)
        base_name = re.sub(r'\s*Volume\s+\d+.*$', '', base_name, flags=re.IGNORECASE)  # Remove volume info
        base_name = re.sub(r'\s*Proceedings\s+of\s+', '', base_name, flags=re.IGNORECASE)  # Remove "Proceedings of"

        # Remove numeric ordinals (1st, 2nd, 3rd, 4th, ...)
        base_name = re.sub(r'\b\d+(?:st|nd|rd|th)\s+', '', base_name, flags=re.IGNORECASE)
        # Drop any stray closing braces
        base_name = base_name.replace("}", "")

        # Remove any words before the first occurrence of "Conference"
        conference_match = re.search(r'\bConference\b', base_name, re.IGNORECASE)
        if conference_match:
            start_pos = conference_match.start()
            base_name = base_name[start_pos:]

        # Remove "the First", "the Second", etc. from the beginning
        base_name = re.sub(r'^the\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|Fifteenth|Sixteenth|Seventeenth|Eighteenth|Nineteenth|Twentieth|Twenty-first|Twenty-second|Twenty-third|Twenty-fourth|Twenty-fifth|Twenty-sixth|Twenty-seventh|Twenty-eighth|Twenty-ninth|Thirtieth|Thirty-first|Thirty-second|Thirty-third|Thirty-fourth|Thirty-fifth|Thirty-sixth|Thirty-seventh|Thirty-eighth|Thirty-ninth|Fortieth|Forty-first|Forty-second|Forty-third|Forty-fourth|Forty-fifth|Forty-sixth|Forty-seventh|Forty-eighth|Forty-ninth|Fiftieth|Fifty-first|Fifty-second|Fifty-third|Fifty-fourth|Fifty-fifth|Fifty-sixth|Fifty-seventh|Fifty-eighth|Fifty-ninth|Sixtieth|Sixty-first|Sixty-second|Sixty-third|Sixty-fourth|Sixty-fifth|Sixty-sixth|Sixty-seventh|Sixty-eighth|Sixty-ninth|Seventieth|Seventy-first|Seventy-second|Seventy-third|Seventy-fourth|Seventy-fifth|Seventy-sixth|Seventy-seventh|Seventy-eighth|Seventy-ninth|Eightieth|Eighty-first|Eighty-second|Eighty-third|Eighty-fourth|Eighty-fifth|Eighty-sixth|Eighty-seventh|Eighty-eighth|Eighty-ninth|Ninetieth|Ninety-first|Ninety-second|Ninety-third|Ninety-fourth|Ninety-fifth|Ninety-sixth|Ninety-seventh|Ninety-eighth|Ninety-ninth|Hundredth)\s+', '', base_name, flags=re.IGNORECASE)
        
        # Remove Roman numerals (I, II, III, IV, ..., XX, ...). This has to happen
        # BEFORE punctuation removal so the word boundaries are still intact.
        # Any standalone token made up only of I, V, X, L, C, D, M is dropped,
        # which is deliberately broad.
        base_name = re.sub(r'\b[IVXLCDM]+\b', '', base_name)
        
        # Replace punctuation with whitespace
        base_name = re.sub(r'[^\w\s]', ' ', base_name)

        # Replace all numbers with whitespace 
        base_name = re.sub(r'\d+', ' ', base_name)

        # Strip descriptions that do not identify the venue (shared tasks, short/poster papers)
        for unneeded_description in UNNEEDED_DESCRIPTIONS:
            base_name = base_name.replace(unneeded_description, "")

        # Replace the first matching full conference name with its abbreviation.
        # The containment check is case-insensitive, so the replacement is too.
        for conf_name, conf_abbr in CONFERENCE_NAME_TO_ABBR.items():
            if conf_name.lower() in base_name.lower():
                base_name = re.sub(re.escape(conf_name), conf_abbr, base_name, flags=re.IGNORECASE)
                break

        # If a known abbreviation appears anywhere in the name, collapse to that abbreviation
        for conf in KNOWN_CONFERENCE_NAMES:
            if conf.lower() in base_name.lower():
                base_name = conf
                break

        if "de la" in base_name or " le " in base_name or base_name == "Conference":
            base_name = "Others"

        if "Multi lingual" in base_name:
            base_name = base_name.replace("Multi lingual", "Multilingual")

        # Clean up extra whitespace and consecutive whitespace
        base_name = re.sub(r'\s+', ' ', base_name).strip()

        # Record the original line under its normalized name and accumulate its count
        conferences[base_name].add(line)
        abbr2count[base_name] += _count

    conference_to_save = {}
    others = []
    rank = 0
    for conf, count in sorted(abbr2count.items(), key=lambda x: x[1], reverse=True):
        ratio = count / len(all_lines)
        # Venues below 0.1% of all lines (and the catch-all "Others") are bucketed together
        if ratio < 0.001 or conf == "Others":
            others.append((conf, count))
            continue

        rank += 1
        conference_to_save[conf] = {
            "count": count,
            "conferences": tuple(conferences[conf]),
        }
        print(f"{rank}. {conf}: {count} [{ratio * 100:.1f}%]")

    conference_to_save[f"Others ({len(others)} Venues)"] = {
        "count": sum(count for conf, count in others),
        "conferences": tuple(conf for conf, count in others),
    }

    # Save to JSON file
    with open('data/unique_conferences.json', 'w', encoding='utf-8') as f:
        json.dump(conference_to_save, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(conference_to_save)} unique conferences")
    print(f"Saved to data/unique_conferences.json")
    
if __name__ == "__main__":
    extract_conference_info()
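
# Illustrative sketch of the output written to data/unique_conferences.json.
# The keys and counts below are placeholders; the real contents depend entirely
# on data/conferences.txt. Regular entries store the raw matched lines, while
# the "Others (...)" entry stores the bucketed normalized venue names.
# {
#   "ACL": {
#     "count": 1234,
#     "conferences": ["...raw proceedings line...", "..."]
#   },
#   ...
#   "Others (42 Venues)": {
#     "count": 567,
#     "conferences": ["...normalized venue name...", "..."]
#   }
# }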