import re
import json
from collections import Counter, defaultdict


# NOTE: Matching below iterates over this dict in insertion order, so the more
# specific names (e.g. the NAACL/EACL chapter names) must come before the bare
# "Association for Computational Linguistics" entry.
CONFERENCE_NAME_TO_ABBR = {
    "Conference on Dependency Linguistics": "DepLing",
    "Conference on Language Modeling": "COLM",
    "European Chapter of the Association for Computational Linguistics": "EACL",
    "North American Chapter of the Association for Computational Linguistics": "NAACL",
    "Empirical Methods in Natural Language Processing": "EMNLP",
    "Association for Computational Linguistics": "ACL",
    "Annual Meeting of the Association for Computational Linguistics": "ACL",
    "International Workshop on Health Text Mining and Information Analysis": "LUOHI",
    "Conference on Computational Semantics": "IWCS",
    "Conference on Machine Translation": "WMT",
    "Conference Recent Advances in Natural Language Processing": "RANLP",
    "Conference on Computational Linguistics": "COLING",
    "Conference of Computational Linguistics": "NODALIDA",
    "Conference on Language Resources and Evaluation": "LREC",
}
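
# Illustrative sketch of how these mappings are applied further below. The
# input string is hypothetical, not taken from data/conferences.txt:
#   "Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)"
#   -> year, trailing parentheses, "Proceedings of", and the ordinal stripped
#   -> "Conference on Language Resources and Evaluation"
#   -> mapped via CONFERENCE_NAME_TO_ABBR to "LREC"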

UNNEEDED_DESCRIPTIONS = [
    "Shared Task",
    "Short Papers",
    "Poster Papers",
    "Poster",
]

# Abbreviations to normalize against; dict.fromkeys preserves order and drops the duplicate "ACL"
KNOWN_CONFERENCE_NAMES = list(dict.fromkeys(CONFERENCE_NAME_TO_ABBR.values()))

def extract_conference_info():
    """Read data/conferences.txt, normalize the venue names, and save the
    unique conferences with their counts to data/unique_conferences.json."""
    
    # Read the conferences file
    with open('data/conferences.txt', 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Split into non-empty lines; keep the per-line frequencies and iterate
    # over the unique lines only
    all_lines = [line.strip() for line in content.split('\n') if line.strip()]
    line_counts = Counter(all_lines)
    lines = list(line_counts)

    # Dictionary to store unique conferences with their years
    conferences = defaultdict(set)
    abbr2count = defaultdict(int)

    for line in lines:
        # Number of times this exact line appears in the raw file
        _count = line_counts[line]

        # Remove leading/trailing whitespace, braces, and commas
        line = line.strip(' {},')

        # Skip empty lines
        if not line:
            continue
            
        # Extract the year (informational only; grouping below uses the name alone)
        year_match = re.search(r'\b(19|20)\d{2}\b', line)
        year = year_match.group() if year_match else None
        
        # Extract the base conference name (remove year and common suffixes)
        # Remove year from the name for grouping
        base_name = re.sub(r'\b(19|20)\d{2}\b', '', line)
        
        # Remove common suffixes that don't affect the core conference name
        if base_name.startswith("Findings"):
            base_name = base_name.split(":")[-1].strip()
        else:
            base_name = re.sub(r'\s*:\s*.*$', '', base_name)  # Remove everything after colon
            base_name = re.sub(r'\s*--\s*.*$', '', base_name)  # Remove everything after double dash
        # Remove trailing parenthesized groups (applied twice so up to two
        # trailing groups are stripped)
        base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name)
        base_name = re.sub(r'\s*\([^)]*\)\s*$', '', base_name)
        base_name = re.sub(r'\s*Volume\s+\d+.*$', '', base_name, flags=re.IGNORECASE)  # Remove volume info
        base_name = re.sub(r'\s*Proceedings\s+of\s+', '', base_name, flags=re.IGNORECASE)  # Remove "Proceedings of"

        # Remove numeric ordinals (1st, 2nd, 3rd, 4th, ...)
        base_name = re.sub(r'\b\d+(?:st|nd|rd|th)\s+', '', base_name, flags=re.IGNORECASE)
        # Drop any stray closing braces
        base_name = base_name.replace("}", "")

        # Remove any words before the first occurrence of "Conference"
        conference_match = re.search(r'\bConference\b', base_name, re.IGNORECASE)
        if conference_match:
            start_pos = conference_match.start()
            base_name = base_name[start_pos:]

        # Remove "the First", "the Second", etc. from the beginning
        base_name = re.sub(r'^the\s+(?:First|Second|Third|Fourth|Fifth|Sixth|Seventh|Eighth|Ninth|Tenth|Eleventh|Twelfth|Thirteenth|Fourteenth|Fifteenth|Sixteenth|Seventeenth|Eighteenth|Nineteenth|Twentieth|Twenty-first|Twenty-second|Twenty-third|Twenty-fourth|Twenty-fifth|Twenty-sixth|Twenty-seventh|Twenty-eighth|Twenty-ninth|Thirtieth|Thirty-first|Thirty-second|Thirty-third|Thirty-fourth|Thirty-fifth|Thirty-sixth|Thirty-seventh|Thirty-eighth|Thirty-ninth|Fortieth|Forty-first|Forty-second|Forty-third|Forty-fourth|Forty-fifth|Forty-sixth|Forty-seventh|Forty-eighth|Forty-ninth|Fiftieth|Fifty-first|Fifty-second|Fifty-third|Fifty-fourth|Fifty-fifth|Fifty-sixth|Fifty-seventh|Fifty-eighth|Fifty-ninth|Sixtieth|Sixty-first|Sixty-second|Sixty-third|Sixty-fourth|Sixty-fifth|Sixty-sixth|Sixty-seventh|Sixty-eighth|Sixty-ninth|Seventieth|Seventy-first|Seventy-second|Seventy-third|Seventy-fourth|Seventy-fifth|Seventy-sixth|Seventy-seventh|Seventy-eighth|Seventy-ninth|Eightieth|Eighty-first|Eighty-second|Eighty-third|Eighty-fourth|Eighty-fifth|Eighty-sixth|Eighty-seventh|Eighty-eighth|Eighty-ninth|Ninetieth|Ninety-first|Ninety-second|Ninety-third|Ninety-fourth|Ninety-fifth|Ninety-sixth|Ninety-seventh|Ninety-eighth|Ninety-ninth|Hundredth)\s+', '', base_name, flags=re.IGNORECASE)
        
        # Remove Roman numerals (I, II, III, IV, ..., XX, ...). This has to happen
        # BEFORE punctuation removal so the word boundaries are still intact.
        # Any standalone token made up only of I, V, X, L, C, D, M is dropped,
        # which is deliberately broad.
        base_name = re.sub(r'\b[IVXLCDM]+\b', '', base_name)
        
        # Replace punctuation with whitespace
        base_name = re.sub(r'[^\w\s]', ' ', base_name)

        # Replace all numbers with whitespace 
        base_name = re.sub(r'\d+', ' ', base_name)

        # Strip descriptions that do not identify the venue (shared tasks, short/poster papers)
        for unneeded_description in UNNEEDED_DESCRIPTIONS:
            base_name = base_name.replace(unneeded_description, "")

        # Replace the first matching full conference name with its abbreviation.
        # The containment check is case-insensitive, so the replacement is too.
        for conf_name, conf_abbr in CONFERENCE_NAME_TO_ABBR.items():
            if conf_name.lower() in base_name.lower():
                base_name = re.sub(re.escape(conf_name), conf_abbr, base_name, flags=re.IGNORECASE)
                break

        # If a known abbreviation appears anywhere in the name, collapse to that abbreviation
        for conf in KNOWN_CONFERENCE_NAMES:
            if conf.lower() in base_name.lower():
                base_name = conf
                break

        if "de la" in base_name or " le " in base_name or base_name == "Conference":
            base_name = "Others"

        if "Multi lingual" in base_name:
            base_name = base_name.replace("Multi lingual", "Multilingual")

        # Clean up extra whitespace and consecutive whitespace
        base_name = re.sub(r'\s+', ' ', base_name).strip()

        # Record the original line under its normalized name and accumulate its count
        conferences[base_name].add(line)
        abbr2count[base_name] += _count

    conference_to_save = {}
    others = []
    rank = 0
    for conf, count in sorted(abbr2count.items(), key=lambda x: x[1], reverse=True):
        ratio = count / len(all_lines)
        # Venues below 0.1% of all lines (and the catch-all "Others") are bucketed together
        if ratio < 0.001 or conf == "Others":
            others.append((conf, count))
            continue

        rank += 1
        conference_to_save[conf] = {
            "count": count,
            "conferences": tuple(conferences[conf]),
        }
        print(f"{rank}. {conf}: {count} [{ratio * 100:.1f}%]")

    conference_to_save[f"Others ({len(others)} Venues)"] = {
        "count": sum(count for conf, count in others),
        "conferences": tuple(conf for conf, count in others),
    }

    # Save to JSON file
    with open('data/unique_conferences.json', 'w', encoding='utf-8') as f:
        json.dump(conference_to_save, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(conference_to_save)} unique conferences")
    print(f"Saved to data/unique_conferences.json")
    
if __name__ == "__main__":
    extract_conference_info()
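
# Illustrative sketch of the output written to data/unique_conferences.json.
# The keys and counts below are placeholders; the real contents depend entirely
# on data/conferences.txt. Regular entries store the raw matched lines, while
# the "Others (...)" entry stores the bucketed normalized venue names.
# {
#   "ACL": {
#     "count": 1234,
#     "conferences": ["...raw proceedings line...", "..."]
#   },
#   ...
#   "Others (42 Venues)": {
#     "count": 567,
#     "conferences": ["...normalized venue name...", "..."]
#   }
# }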