{ "cells": [ { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting pyarrow\n", " Downloading pyarrow-19.0.1-cp310-cp310-win_amd64.whl (25.3 MB)\n", " ---------------------------------------- 25.3/25.3 MB 2.4 MB/s eta 0:00:00\n", "Installing collected packages: pyarrow\n", "Successfully installed pyarrow-19.0.1\n", "Note: you may need to restart the kernel to use updated packages.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING: You are using pip version 22.0.4; however, version 25.0.1 is available.\n", "You should consider upgrading via the 'c:\\Users\\Romain\\AppData\\Local\\Programs\\Python\\Python310\\python.exe -m pip install --upgrade pip' command.\n" ] } ], "source": [ "%pip install pyarrow" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import csv\n", "import re\n", "import polars as pl" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from __future__ import annotations\n", "import re, csv, pathlib, polars as pl\n", "\n", "ROOT = pathlib.Path(r\"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\")\n", "DATASETS = [\n", " \"project\",\n", " \"projectDeliverables\",\n", " \"projectPublications\",\n", " \"reportSummaries\",\n", " \"organization\",\n", " \"euroSciVoc\",\n", " \"topics\",\n", " \"webItem\",\n", " \"webLink\",\n", " \"legalBasis\",\n", "]\n", "OUTDIR = ROOT / \"combined\"\n", "OUTDIR.mkdir(exist_ok=True)\n", "\n", "###############################################################################\n", "# 2. 
###############################################################################
# 2. Generic cleaner: parameterised version of the original per-file loop
#    NOTE: written against polars >= 0.20 (uses str.strip_chars).
#    `re`, `csv`, `pathlib` and `pl` are imported once at the top of this cell.
###############################################################################
_PROJECT_ID_RE = re.compile(r"^(?:19|20)\d{2}")
_GENERIC_NUM_RE = re.compile(r"\d{4}")
_ISO_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}$")

# Layout of the datasets whose rows are structurally repaired:
#   dataset -> (expected column count,
#               0-based index of the free-text column that absorbs overflow,
#               number of fixed columns that follow that free-text column)
# Invariant: overflow index + 1 + trailing count == expected column count.
# NOTE(review): for "organization" the overflow column is index 4; which field
# that is in organization.csv is not visible from this notebook - confirm.
_ROW_LAYOUTS = {
    "project": (20, 16, 3),
    "organization": (25, 4, 20),
}

# Column positions used by the project-only title repair (0-based).
_TITLE_COL = 3
_DATE1_COL = 4  # startDate
_DATE2_COL = 5  # endDate

_DATE_FMT = "%Y-%m-%d"
_TIMESTAMP_FMT = "%Y-%m-%d %H:%M:%S"

# Programme folder whose projectPublications.csv uses upper-case headers.
_LEGACY_PUBLICATIONS_FOLDER = "H2013"
_LEGACY_PUBLICATION_RENAMES = {
    "RECORD_ID": "id",
    "TITLE": "title",
    "AUTHOR": "authors",
    "DOI": "doi",
    "PROJECT_ID": "projectID",
    "JOURNAL_TITLE": "journalTitle",
    "PAGES": "publishedPages",
    "PUBLICATION_TYPE": "isPublishedAs",
}

_PROJECT_TEXT_COLS = (
    "acronym", "status", "title", "legalBasis", "topics", "frameworkProgramme",
    "masterCall", "subCall", "fundingScheme", "nature", "objective", "grantDoi",
)
_PROJECT_MONEY_COLS = ("totalCost", "ecMaxContribution")
_PROJECT_DATE_COLS = ("startDate", "endDate", "ecSignatureDate")


def _is_date_or_blank(text: str) -> bool:
    """Return True when *text* is empty or is a bare ``YYYY-MM-DD`` value."""
    return text == "" or bool(_ISO_DATE_RE.match(text))


def _repair_row(cells: list[str], dataset: str) -> "list[str] | None":
    """Restore the expected column count of one blindly split row.

    Datasets without an entry in ``_ROW_LAYOUTS`` are returned untouched.
    Returns ``None`` when the row cannot be repaired and must be skipped.
    """
    layout = _ROW_LAYOUTS.get(dataset)
    if layout is None:
        return cells
    expected_cols, overflow_col, trailing_keep = layout

    # 1) project only: a ';' inside *title* pushes the two date columns to the
    #    right. Find the first pair of adjacent date-or-blank cells and glue
    #    everything between the title position and that pair back together.
    #    Two adjacent empty cells also count as a pair.
    if (
        dataset == "project"
        and len(cells) > expected_cols
        and not (_is_date_or_blank(cells[_DATE1_COL])
                 and _is_date_or_blank(cells[_DATE2_COL]))
    ):
        pair_at = next(
            (
                i
                for i in range(_DATE1_COL, len(cells) - 1)
                if _is_date_or_blank(cells[i]) and _is_date_or_blank(cells[i + 1])
            ),
            None,
        )
        if pair_at is None:
            return None  # no date pair anywhere: give up on this line
        cells = (
            cells[:_TITLE_COL]
            + [";".join(cells[_TITLE_COL:pair_at])]
            + cells[pair_at:]
        )

    # 2) still too wide: fold the surplus into the free-text column, keeping
    #    the fixed trailing columns in place.
    if len(cells) > expected_cols:
        cells = (
            cells[:overflow_col]
            + [";".join(cells[overflow_col:-trailing_keep])]
            + cells[-trailing_keep:]
        )

    # 3) too narrow: pad with empty cells.
    if len(cells) < expected_cols:
        cells = cells + [""] * (expected_cols - len(cells))

    # Defensive guard kept from the original loop.
    return cells if len(cells) == expected_cols else None


def _clean_cells(cells: list[str], number_regex: re.Pattern[str]) -> list[str]:
    """Apply the per-cell text clean-ups and return the cleaned row."""
    cleaned: list[str] = []
    for cell in cells:
        if cell in ('""', ""):
            cell = ""
        else:
            cell = cell.replace("\t", " ").replace('"""', '"').strip()
            # Cells fully matched by number_regex lose their leading zeros.
            if number_regex.fullmatch(cell):
                cell = cell.lstrip("0") or "0"
        cleaned.append(cell.strip('"'))
    # Last and first columns additionally lose every remaining quote; the last
    # column also loses commas.
    cleaned[-1] = cleaned[-1].replace('"', '').replace(',', '')
    cleaned[0] = cleaned[0].replace('"', '')
    return cleaned


def _clean_one_file(csv_path: pathlib.Path,
                    number_regex: re.Pattern[str], dataset: str) -> "pl.DataFrame":
    """
    Clean one semicolon-separated CORDIS CSV and load it as an all-string frame.

    The file is read one physical line at a time and split blindly on ';', so
    stray quotes / semicolons inside long text fields produce extra columns.

    Strategy
    --------
    * Only ``project`` (20 columns) and ``organization`` (25 columns) rows are
      structurally repaired; see ``_ROW_LAYOUTS`` and ``_repair_row``.
    * Rows of every other dataset only receive the per-cell clean-ups.
    * The header line goes through the same pipeline as the data lines.
    * Records containing embedded newlines are NOT re-joined.

    Parameters
    ----------
    csv_path : source CSV; a sibling ``<name>.cleaned.csv`` (pipe-separated)
        is written next to it and left on disk.
    number_regex : cells fully matching this pattern get leading zeros removed.
    dataset : dataset name, selects the repair layout.

    Returns
    -------
    pl.DataFrame with every column read as a string and "" read as null.
    """
    tmp_clean = csv_path.with_suffix(".cleaned.csv")

    with csv_path.open(encoding="utf-8", newline="") as fin, \
         tmp_clean.open("w", encoding="utf-8", newline="") as fout:

        writer = csv.writer(
            fout,
            delimiter="|",
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
            lineterminator="\n",
        )

        for raw in fin:
            cells = _repair_row(raw.rstrip("\n").split(";"), dataset)
            if cells is None:
                continue
            writer.writerow(_clean_cells(cells, number_regex))

    # infer_schema_length=0 makes polars read every column as a string.
    return pl.read_csv(
        tmp_clean,
        separator="|",
        quote_char='"',
        has_header=True,
        infer_schema_length=0,
        null_values=[""],
        truncate_ragged_lines=True,
    )


def _text(column: str) -> "pl.Expr":
    """Expression selecting *column* as a string."""
    return pl.col(column).cast(pl.Utf8, strict=False)


def _timestamp(column: str) -> "pl.Expr":
    """Expression parsing *column* as ``YYYY-MM-DD HH:MM:SS`` (bad values -> null)."""
    return _text(column).str.strptime(pl.Datetime, _TIMESTAMP_FMT, strict=False)


def _coerce_types(df: "pl.DataFrame", dataset: str, programme_name: str) -> "pl.DataFrame":
    """Apply the dataset-specific casts / derived columns to a cleaned frame.

    Datasets with no specific rule are returned unchanged.
    """
    if dataset == "project":
        def unquoted(column: str) -> "pl.Expr":
            return _text(column).str.strip_chars('"')

        return (
            df
            .with_columns(
                [unquoted(c) for c in _PROJECT_TEXT_COLS]
                # decimal comma -> decimal point before the float cast
                + [
                    unquoted(c).str.replace_all('"', '').str.replace(",", ".").cast(pl.Float64)
                    for c in _PROJECT_MONEY_COLS
                ]
                + [
                    unquoted(c).str.strptime(pl.Date, _DATE_FMT, strict=False)
                    for c in _PROJECT_DATE_COLS
                ]
                + [
                    unquoted("contentUpdateDate").str.strptime(pl.Datetime, _TIMESTAMP_FMT, strict=False),
                    pl.col("rcn").cast(pl.Int64),
                ]
            )
            .with_columns(pl.lit(programme_name).alias("programmeFolder"))
        )

    if dataset == "organization":
        return df.with_columns([
            _timestamp("contentUpdateDate"),
            _text("totalCost").str.replace(",", ".").cast(pl.Float64),
        ])

    if dataset in ("projectDeliverables", "reportSummaries"):
        return df.with_columns(_timestamp("contentUpdateDate"))

    if dataset == "projectPublications":
        if programme_name == _LEGACY_PUBLICATIONS_FOLDER:
            # Legacy export: only the headers are normalised.
            return df.rename(_LEGACY_PUBLICATION_RENAMES)
        return df.with_columns([
            _timestamp("contentUpdateDate"),
            # projectID is the leading digits of the publication id.
            _text("id").str.extract(r"^(\d+)_", 1).alias("projectID"),
        ])

    if dataset == "webItem":
        return df.with_columns(
            _text("uri")
            .str.extract(r"/files/\d+/(\d+)/", 1)
            .cast(pl.Int64)
            .alias("projectID"),
        )

    return df


def combine_all_programmes() -> None:
    """Clean every ``<programme>/<dataset>.csv`` under ROOT and write one
    ``<dataset>_all.parquet`` per dataset into OUTDIR.

    Programme folders are visited in sorted order so the row order (and, for
    the diagonal concat, the column order) of the output is deterministic.
    Folders that lack the dataset's CSV are skipped silently.
    """
    for dataset in DATASETS:
        number_regex = _PROJECT_ID_RE if dataset == "project" else _GENERIC_NUM_RE
        combined: list["pl.DataFrame"] = []

        for programme_dir in sorted(ROOT.iterdir()):
            if not programme_dir.is_dir():
                continue
            csv_file = programme_dir / f"{dataset}.csv"
            if not csv_file.exists():
                continue

            df = _clean_one_file(csv_file, number_regex, dataset)
            print(programme_dir)
            combined.append(_coerce_types(df, dataset, programme_dir.name))

        if not combined:
            continue

        # The legacy publications export has a different column set, so that
        # dataset needs a diagonal (union-of-columns) concat.
        how = "diagonal" if dataset == "projectPublications" else "vertical_relaxed"
        result = pl.concat(combined, how=how)
        parquet_path = OUTDIR / f"{dataset}_all.parquet"
        result.write_parquet(parquet_path)
        print(f"✔ {dataset:15s} → {parquet_path}")
Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n", "✔ projectDeliverables → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\projectDeliverables_all.parquet\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n", "✔ projectPublications → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\projectPublications_all.parquet\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n", "✔ reportSummaries → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\reportSummaries_all.parquet\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1984\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data 
Analytics\\CORDIS\\H1994\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n", "✔ organization → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\organization_all.parquet\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1984\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n", "✔ euroSciVoc → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data 
Analytics\\CORDIS\\combined\\euroSciVoc_all.parquet\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n", "✔ topics → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\topics_all.parquet\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n", 
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n", "✔ webItem → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\webItem_all.parquet\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n", "✔ webLink → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\webLink_all.parquet\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1984\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n", "C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n", "C:\\Users\\Romain\\OneDrive - KU 
# %% Run the per-dataset combination step
combine_all_programmes()

# %% [markdown]
# #### Consolidate projects

# %% Build one wide row per project from the per-dataset parquet files
import pathlib
import polars as pl

ROOT = pathlib.Path(r"C:\Users\Romain\OneDrive - KU Leuven\Masters\MBIS\Year 2\Semester 2\Modern Data Analytics\CORDIS")
OUTDIR = ROOT / "combined"
DATASETS = [
    "project",
    "projectDeliverables",
    "projectPublications",
    "reportSummaries",
    "organization",
    "euroSciVoc",
    "topics",
    "webItem",
    "webLink",
    "legalBasis",
]


def _lists_per_project(frame, columns, aliases=None):
    """Group *frame* by ``projectID`` and collect each of *columns* into a list.

    Output columns are named ``list_<column>`` unless *aliases* maps the
    source column to a different output name.
    """
    aliases = aliases or {}
    return frame.group_by("projectID").agg(
        [pl.col(name).alias(aliases.get(name, f"list_{name}")) for name in columns]
    )


dfs = {
    dataset: pl.read_parquet(OUTDIR / f"{dataset}_all.parquet")
    for dataset in DATASETS
}

projects = dfs["project"]

projects_deliv = _lists_per_project(
    dfs["projectDeliverables"],
    ["deliverableType", "url", "contentUpdateDate"],
)

projects_publi = _lists_per_project(
    dfs["projectPublications"],
    ["authors", "title", "doi", "journalTitle", "isPublishedAs",
     "publishedYear", "contentUpdateDate"],
)

report = _lists_per_project(
    dfs["reportSummaries"],
    ["title", "attachment", "contentUpdateDate"],
)

org = _lists_per_project(
    dfs["organization"],
    ["organisationID", "country", "name", "SME", "city", "geolocation",
     "organizationURL", "role", "ecContribution", "netEcContribution",
     "totalCost", "endOfParticipation", "activityType", "contentUpdateDate"],
)

voc = _lists_per_project(
    dfs["euroSciVoc"],
    ["euroSciVocTitle", "euroSciVocPath", "euroSciVocDescription"],
    aliases={"euroSciVocDescription": "list_description"},
)

topic = _lists_per_project(dfs["topics"], ["topic", "title"])

web_item = dfs["webItem"]   # no aggregation; not joined below

web_link = _lists_per_project(
    dfs["webLink"],
    ["physUrl", "availableLanguages", "status", "archivedDate", "type",
     "source", "represents"],
)

legal = _lists_per_project(
    dfs["legalBasis"],
    ["legalBasis", "title", "uniqueProgrammePart"],
)

# Left-join every per-project list table onto the project table. The join
# order matters: a clashing list_* name keeps its plain form for the first
# table that brings it and gets the suffix for later ones.
consolidated = projects
for side_table, suffix in [
    (projects_deliv, "_deliv"),
    (projects_publi, "_publi"),
    (report, "_report"),
    (org, "_org"),
    (web_link, "_link"),
    (legal, "_legal"),
    (topic, "_topic"),
    (voc, "_voc"),
]:
    consolidated = consolidated.join(
        side_table, left_on="id", right_on="projectID", suffix=suffix, how="left"
    )

# Only needed when the parquet still stores the dates as strings.
# Fixed: DataFrame.with_column is not part of the current polars API and
# raised AttributeError here; with_columns is the supported method.
for col in ["startDate", "endDate"]:
    if consolidated[col].dtype == pl.Utf8:
        consolidated = consolidated.with_columns(
            pl.col(col).str.strptime(pl.Date, "%Y-%m-%d").alias(col)
        )

# Project-level net EC contribution = sum over participating organisations.
# NOTE(review): a list holding only nulls sums to 0.0 (visible in the preview
# output of this notebook), so "no data" and "zero" are indistinguishable here
# and in ecRatio below - confirm this is intended.
consolidated = consolidated.with_columns(
    pl.col("list_netEcContribution").list.eval(pl.element().cast(pl.Float64), parallel=True)
      .list.sum().alias("netEcContribution")
)

consolidated = consolidated.with_columns(
    pl.col("totalCost").cast(pl.Float64),
    pl.col("netEcContribution").cast(pl.Float64)
)

# NOTE(review): ecRatio divides by totalCost without guarding against 0.
consolidated = consolidated.with_columns([
    pl.col("startDate").dt.year().alias("startYear"),
    pl.col("endDate").dt.year().alias("endYear"),
    (pl.col("endDate") - pl.col("startDate")).dt.total_days().alias("durationDays"),
    (pl.col("netEcContribution") / pl.col("totalCost")).alias("ecRatio"),
])

consolidated.write_parquet(OUTDIR / "consolidated.parquet")

excluded_frameworks = ["FP1", "FP2", "FP3", "FP4", "FP5", "FP6"]

consolidated_clean = consolidated.filter(
    ~pl.col("frameworkProgramme").is_in(excluded_frameworks)
)

consolidated_clean.write_parquet(OUTDIR / "consolidated_clean.parquet")

# %% Reload the cleaned snapshot
# NOTE: ROOT / OUTDIR are rebound here to a different folder than the one the
# consolidation cell wrote to.
import polars as pl
import pathlib
ROOT = pathlib.Path(r"C:\Users\Romain\OneDrive - KU Leuven\MDA\backend\data")
OUTDIR = ROOT #/ "combined"

#consolidated = pl.read_parquet(OUTDIR / "consolidated.parquet")
consolidated_clean = pl.read_parquet(OUTDIR / "consolidated_clean.parquet")
\n", "shape: (5, 68)
idacronymstatustitlestartDateendDatetotalCostecMaxContributionlegalBasistopicsecSignatureDateframeworkProgrammemasterCallsubCallfundingSchemenatureobjectivecontentUpdateDatercngrantDoiprogrammeFolderlist_deliverableTypelist_urllist_contentUpdateDatelist_authorslist_titlelist_doilist_journalTitlelist_isPublishedAslist_publishedYearlist_contentUpdateDate_publilist_title_reportlist_attachmentlist_contentUpdateDate_reportlist_organisationIDlist_countrylist_namelist_SMElist_citylist_geolocationlist_organizationURLlist_rolelist_ecContributionlist_netEcContributionlist_totalCostlist_endOfParticipationlist_activityTypelist_contentUpdateDate_orglist_physUrllist_availableLanguageslist_statuslist_archivedDatelist_typelist_sourcelist_representslist_legalBasislist_title_legallist_uniqueProgrammePartlist_topiclist_title_topiclist_euroSciVocTitlelist_euroSciVocPathlist_descriptionnetEcContributionstartYearendYeardurationDaysecRatio
strstrstrstrdatedatef64f64strstrdatestrstrstrstrstrstrdatetime[μs]i64strstrlist[str]list[str]list[datetime[μs]]list[str]list[str]list[str]list[str]list[str]list[str]list[datetime[μs]]list[str]list[str]list[datetime[μs]]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[f64]list[str]list[str]list[datetime[μs]]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]f64i32i32i64f64
"624794""COMPACTABILITY""CLOSED""Contribution of Compact Neighb…2014-12-012016-11-30309235.2309235.2"FP7-PEOPLE""FP7-PEOPLE-2013-IEF"null"FP7"null"FP7-PEOPLE-2013-IEF""MC-IEF"null"This research investigates how…2017-04-10 11:25:29187874null"H2013"nullnullnullnullnullnullnullnullnullnull["Final Report Summary - COMPACTABILITY (Contribution of Compact Neighbourhoods to Social Sustainability)"]["/docs/results/624/624794/final1-table-1.jpg"][2017-03-07 17:25:15]["999446873"]["UK"]["OXFORD BROOKES UNIVERSITY"][null]["Oxford"]["51.7520131,-1.2578498"]["http://www.brookes.ac.uk"]["coordinator"]["309235.2"][null][null]["false"]["HES"][2017-04-10 11:25:29]nullnullnullnullnullnullnull["FP7-PEOPLE"]["Specific programme "People" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"][null]["FP7-PEOPLE-2013-IEF"]["Marie-Curie Action: Intra-European fellowships for career development"]nullnullnull0.0201420167300.0
"276810""ARCHOSL""CLOSED""Archives of Early Human Occupa…2011-03-012014-02-2875000.075000.0"FP7-PEOPLE""FP7-PEOPLE-2009-RG"null"FP7"null"FP7-PEOPLE-2010-RG""MC-IRG"null"A number of important archaeol…2019-08-02 13:24:5198178null"H2013"nullnullnull["Arnold, L.J., Demuro, M., Parés, J.M., Arsuaga, J.L., Aranburu, A.,", "Lee J. Arnold , Martina Demuro , Marta Navazo , Alfonso Benito-Calvo , Alfredo Pérez-González", … "F. Gutiérrez , B. Valero-Garcés , G. Desir , P. González-Sampériz , M. Gutiérrez , R. Linares , M. Zarroca , A. Moreno , J. Guerrero , C. Roqué"]["Luminescence dating and palaeomagnetic age constraint on hominins from Sima de los Huesos, Atapuerca, Spain", "OSL dating of the Middle Palaeolithic Hotel California site, Sierra de Atapuerca, north-central Spain", … "Late Holocene evolution of playa lakes in the central Ebro depression based on geophysical surveys and morpho-stratigraphic analysis of lacustrine terraces"]["http://dx.doi.org/10.1016/j.jhevol.2013.12.001", "10.1111/j.1502-3885.2012.00262.x", … "http://dx.doi.org/10.1016/j.geomorph.2012.02.013"]["Journal of Human Evolution", "Boreas", … "Geomorphology"]["PEER REVIEWED ARTICLE", "PEER REVIEWED ARTICLE", … "PEER REVIEWED ARTICLE"][null, null, … null][null, null, … null]["Final Report Summary - ARCHOSL (Archives of Early Human Occupation in Western Europe: OSL Chronologies beyond the Middle Pleistocene in the Iberian Peninsula)"][null][2014-11-07 13:26:06]["986579241"]["ES"]["CENTRO NACIONAL DE INVESTIGACION SOBRE LA EVOLUCION HUMANA"][null]["Burgos"]["42.3396185,-3.6967044"]["http://www.cenieh.es"]["coordinator"]["75000"][null][null]["false"]["REC"][2019-08-02 13:24:51]nullnullnullnullnullnullnull["FP7-PEOPLE"]["Specific programme "People" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"][null]["FP7-PEOPLE-2009-RG"]["Marie Curie Action: Reintegration Grants"]["ethnoarchaeology", "physical 
anthropology"]["/humanities/history and archaeology/archaeology/ethnoarchaeology", "/social sciences/sociology/anthropology/physical anthropology"][null, null]0.02011201410950.0
"622478""DETforDRF 2.0""CLOSED""Design and Expansion Turbine f…nullnull161968.8161968.8"FP7-PEOPLE""FP7-PEOPLE-2013-IEF"null"FP7"null"FP7-PEOPLE-2013-IEF""MC-IEF"null"This proposal for a Marie Curi…2016-03-31 21:10:31187686null"H2013"nullnullnullnullnullnullnullnullnullnullnullnullnull["953573536"]["DE"]["BSH HAUSGERATE GMBH"][null]["Munchen"]["48.0887063,11.6433468"]["http://www.bsh-group.com"]["coordinator"]["161968.8"][null][null]["false"]["PRC"][2016-03-31 21:10:31]nullnullnullnullnullnullnull["FP7-PEOPLE"]["Specific programme "People" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"][null]["FP7-PEOPLE-2013-IEF"]["Marie-Curie Action: Intra-European fellowships for career development"]["fluid dynamics"]["/natural sciences/physical sciences/classical mechanics/fluid mechanics/fluid dynamics"][null]0.0nullnullnull0.0
"615785""EMERGING SUBJECTS""CLOSED""Emerging Subjects of the New E…2014-09-012019-06-301.658373e61.658373e6"FP7-IDEAS-ERC""ERC-CG-2013-SH2"null"FP7"null"ERC-2013-CoG""ERC-CG"null"This project examines how pred…2023-04-05 11:40:06188675null"H2013"nullnullnull["Rebekah Plueckhahn", "Dulam, Bumochir", … "•Empson, R. A."]["Tragic Spirits: Shamanism, Memory, and Gender in Contemporary Mongolia by Manduhai Buyandelger.", "The Afterlife of Nomadism: Pastoralism, environmentalism, civilization and identity in Mongolia and China", … "A Space That Will Never Be Filled Sharp Communication and the Simultaneity of Opposites."]["10.1111/aman.12304", null, … null]["American Anthropologist", "Pastoralist Livelihoods in Asian Drylands: Environment, Governance and Risk", … "Current Anthropology"]["PEER_REVIEWED_ARTICLE", "ARTICLE", … "ARTICLE"][null, null, … null][null, null, … null]["Final Report Summary - EMERGING SUBJECTS (Emerging Subjects of the New Economy: Tracing Economic Growth in Mongolia)"][null][2018-01-15 17:25:25]["888898146"][null]["UNIVERSITY COLLEGE LONDON"][null]["LONDON"]["51.5236746,-0.1339608"]["http://www.ucl.ac.uk"]["coordinator"]["1658373"][null][null]["false"]["HES"][2023-04-05 11:40:06]nullnullnullnullnullnullnull["FP7-IDEAS-ERC"]["Specific programme: "Ideas" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"][null]["ERC-CG-2013-SH2"]["ERC Consolidator Grant - Institutions Values Beliefs and behaviour"]["anthropology"]["/social sciences/sociology/anthropology"][null]0.02014201917630.0
"237010""DEER PALAEOBIOLOGY""CLOSED""Palaeobiological inference thr…2009-04-092011-01-08173416.47173416.47"FP7-PEOPLE""FP7-PEOPLE-IEF-2008"null"FP7"null"FP7-PEOPLE-IEF-2008""MC-IEF"null"The present research aims to r…2019-07-16 19:18:2590424null"H2013"nullnullnull["Lister, A.M., Breda, M. and others", "Breda, M., Lister, A.M. & others"]["Metric analysis of ungulate mammals in the early Middle Pleistocene of Britain, in relation to taxonomy and biostratigraphy. II. Cervidae, Equidae and Suidae.", "Metric analysis of ungulate mammals in the early Middle Pleistocene of Britain, in relation to taxonomy and biostratigraphy. I: Rhinocerotidae and Bovidae."][null, null]["Quaternary International", "Quaternary International"]["PEER REVIEWED ARTICLE", "PEER REVIEWED ARTICLE"][null, null][null, null]["Final Report Summary - DEER PALAEOBIOLOGY (Palaeobiological inference through phylogenetic analysis of Pleistocene deer)"][null][2013-07-05 00:02:53]["999642037"]["UK"]["NATURAL HISTORY MUSEUM"][null]["London"]["51.494882,-0.1847716"]["http://www.nhm.ac.uk/"]["coordinator"]["173416.47"][null][null]["false"]["PUB"][2019-07-16 19:18:25]nullnullnullnullnullnullnull["FP7-PEOPLE"]["Specific programme "People" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"][null]["FP7-PEOPLE-IEF-2008"]["Marie Curie Action: Intra-European Fellowships for Career Development"]["comparative morphology"]["/natural sciences/biological sciences/biological morphology/comparative morphology"][null]0.0200920116390.0
" ], "text/plain": [ "shape: (5, 68)\n", "┌────────┬──────────────┬────────┬──────────────┬───┬───────────┬─────────┬──────────────┬─────────┐\n", "│ id ┆ acronym ┆ status ┆ title ┆ … ┆ startYear ┆ endYear ┆ durationDays ┆ ecRatio │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ str ┆ ┆ i32 ┆ i32 ┆ i64 ┆ f64 │\n", "╞════════╪══════════════╪════════╪══════════════╪═══╪═══════════╪═════════╪══════════════╪═════════╡\n", "│ 624794 ┆ COMPACTABILI ┆ CLOSED ┆ Contribution ┆ … ┆ 2014 ┆ 2016 ┆ 730 ┆ 0.0 │\n", "│ ┆ TY ┆ ┆ of Compact ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ Neighb… ┆ ┆ ┆ ┆ ┆ │\n", "│ 276810 ┆ ARCHOSL ┆ CLOSED ┆ Archives of ┆ … ┆ 2011 ┆ 2014 ┆ 1095 ┆ 0.0 │\n", "│ ┆ ┆ ┆ Early Human ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ Occupa… ┆ ┆ ┆ ┆ ┆ │\n", "│ 622478 ┆ DETforDRF ┆ CLOSED ┆ Design and ┆ … ┆ null ┆ null ┆ null ┆ 0.0 │\n", "│ ┆ 2.0 ┆ ┆ Expansion ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ Turbine f… ┆ ┆ ┆ ┆ ┆ │\n", "│ 615785 ┆ EMERGING ┆ CLOSED ┆ Emerging ┆ … ┆ 2014 ┆ 2019 ┆ 1763 ┆ 0.0 │\n", "│ ┆ SUBJECTS ┆ ┆ Subjects of ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ the New E… ┆ ┆ ┆ ┆ ┆ │\n", "│ 237010 ┆ DEER PALAEOB ┆ CLOSED ┆ Palaeobiolog ┆ … ┆ 2009 ┆ 2011 ┆ 639 ┆ 0.0 │\n", "│ ┆ IOLOGY ┆ ┆ ical ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ inference ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ thr… ┆ ┆ ┆ ┆ ┆ │\n", "└────────┴──────────────┴────────┴──────────────┴───┴───────────┴─────────┴──────────────┴─────────┘" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "consolidated_clean.head()\n", "#ecMaxContribution, endDate, status, legalBasis, frameworkProgramme, fundingScheme, list_title_report, list_name, list_role, list_city, list_country, list_ecContribution, list_activityType, durationDays" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "RAG should be able to answer questions related to funding for: EuroSciVoc classifications, Topic, Projects, Organizations, legalBasis.\n", "\n", "- Expand each of these datasets with summary statistics (some based on the dataset itself, 
others based on the counts of status (terminated/closed), etc.)\n", "- In the prompt, include an instruction to ask, for example, whether they want more information on the topics of a project, or some short information about the different projects carried out by an organisation.\n", "- ..." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "org = pl.read_parquet(OUTDIR / \"organization_all.parquet\")\n", "org_cleaned = (\n", " org.group_by([\"organisationID\",\"name\",\"vatNumber\",\"shortName\",\"SME\",\"activityType\",\"street\",\"postCode\",\"city\",\"country\",\"nutsCode\",\"geolocation\",\"organizationURL\"])\n", " .agg([\n", " pl.col(\"projectID\") .alias(\"list_projectID\"),\n", " pl.col(\"projectAcronym\") .alias(\"list_projectAcronym\"),\n", " pl.col(\"rcn\").alias(\"list_rcn\"),\n", " pl.col(\"order\") .alias(\"list_order\"),\n", " pl.col(\"role\") .alias(\"list_role\"),\n", " pl.col(\"ecContribution\") .alias(\"list_ecContribution\"),\n", " pl.col(\"netEcContribution\") .alias(\"list_netEcContribution\"),\n", " pl.col(\"totalCost\") .alias(\"list_totalCost\"),\n", " pl.col(\"endOfParticipation\") .alias(\"endOfParticipation\"),\n", " ])\n", ")\n", "org_cleaned.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 68)
idacronymstatustitlestartDateendDatetotalCostecMaxContributionlegalBasistopicsecSignatureDateframeworkProgrammemasterCallsubCallfundingSchemenatureobjectivecontentUpdateDatercngrantDoiprogrammeFolderlist_deliverableTypelist_urllist_contentUpdateDatelist_authorslist_titlelist_doilist_journalTitlelist_isPublishedAslist_publishedYearlist_contentUpdateDate_publilist_title_reportlist_attachmentlist_contentUpdateDate_reportlist_organisationIDlist_countrylist_namelist_SMElist_citylist_geolocationlist_organizationURLlist_rolelist_ecContributionlist_netEcContributionlist_totalCostlist_endOfParticipationlist_activityTypelist_contentUpdateDate_orglist_physUrllist_availableLanguageslist_statuslist_archivedDatelist_typelist_sourcelist_representslist_legalBasislist_title_legallist_uniqueProgrammePartlist_topiclist_title_topiclist_euroSciVocTitlelist_euroSciVocPathlist_descriptionnetEcContributionstartYearendYeardurationDaysecRatio
strstrstrstrdatedatef64f64strstrdatestrstrstrstrstrstrdatetime[μs]i64strstrlist[str]list[str]list[datetime[μs]]list[str]list[str]list[str]list[str]list[str]list[str]list[datetime[μs]]list[str]list[str]list[datetime[μs]]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[f64]list[str]list[str]list[datetime[μs]]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]f64i32i32i64f64
"1476"nullnull"Integrated Sensor-Based Robot …1985-01-011988-01-01nullnull"FP1-ESPRIT 1"nullnull"FP1"nullnullnullnull"The objective of this project …1992-12-04 01:04:588338null"H1984"nullnullnullnullnullnullnullnullnullnullnullnullnull[null, null, … null]["UK", "PT", … "UK"]["UNIVERSITY OF NEWCASTLE UPON TYNE", "UNIV NOVA DE LISBOA", … "MARI Applied Technologies Ltd"][null, null, … null]["Newcastle upon Tyne", "CAPARICA", … "Boldon"][null, "38.6633229,-9.2032444", … null][null, null, … null]["participant", "participant", … "coordinator"][null, null, … null][null, null, … null][null, null, … null][null, null, … null][null, null, … null][1992-12-04 01:04:58, 1992-12-04 01:04:58, … 1992-12-04 01:04:58]nullnullnullnullnullnullnull["FP1-ESPRIT 1"]["European programme (EEC) for research and development in information technologies (ESPRIT), 1984-1988"][null]nullnull["software", "sensors"]["/natural sciences/computer and information sciences/software", "/engineering and technology/electrical engineering, electronic engineering, information engineering/electronic engineering/sensors"][null, null]0.0198519881095null
"395""INCA"null"An Integrated Network Architec…1984-09-011989-09-01nullnull"FP1-ESPRIT 1"nullnull"FP1"nullnullnullnull"The principal purpose of the I…1992-12-09 00:00:028633null"H1984"nullnullnullnullnullnullnullnullnullnullnullnullnull[null, null, … null]["IT", "IT", … "UK"]["Ingegneria C. Olivetti and C. SpA", "System Wizards Srl", … "Birkbeck College, University of London"][null, null, … null]["Pozzuoli Napoli", "Torino", … "London"]["40.8349492,14.1067963", "45.0993613,7.6747093", … "51.5199928,-0.1299654"][null, null, … null]["participant", "participant", … "participant"][null, null, … null][null, null, … null][null, null, … null][null, null, … null][null, null, … null][1992-12-09 00:00:02, 1992-12-09 00:00:02, … 1992-12-09 00:00:02]nullnullnullnullnullnullnull["FP1-ESPRIT 1"]["European programme (EEC) for research and development in information technologies (ESPRIT), 1984-1988"][null]nullnull["software", "telecommunications"]["/natural sciences/computer and information sciences/software", "/engineering and technology/electrical engineering, electronic engineering, information engineering/information engineering/telecommunications"][null, null]0.0198419891826null
"EN3M0096"nullnull"IMPROVEMENT AND APPLICATION OF…1988-07-011989-02-28nullnull"FP1-ENNONUC 3C"nullnull"FP1"nullnull"CSC"null"THERE WILL BE NUMEROUS SCIENTI…1994-01-23 00:00:0212659null"H1984"nullnullnullnullnullnullnullnullnullnullnullnullnull[null]["FR"]["Chambre de Commerce et d'Industrie de Paris (CCIP)"][null]["Paris"]["46.6769224,-1.4244831"][null]["coordinator"][null][null][null][null][null][1994-01-23 00:00:02]nullnullnullnullnullnullnull["FP1-ENNONUC 3C"]["Research and development programme (EEC) in the field of Non-Nuclear Energy, 1985-1988"][null]nullnull["software"]["/natural sciences/computer and information sciences/software"][null]0.019881989242null
"874""CONCORDIA"null"Integrated Environment for Rel…1985-12-011986-12-01nullnull"FP1-ESPRIT 1"nullnull"FP1"nullnullnullnull"The CONCORDIA project aimed to…1992-12-02 00:02:558413null"H1984"nullnullnullnullnullnullnullnullnullnullnullnullnull[null, null, … null]["IT", "FR", … "UK"]["Telettra SpA", "JEAN LEFEBVRE TELECOM", … "MARI Applied Technologies Ltd"][null, null, … null]["Bologna", "PUTEAUX", … "Boldon"]["44.5009260,11.3481129", "48.8811551,2.2457420", … null][null, null, … null]["participant", "participant", … "coordinator"][null, null, … null][null, null, … null][null, null, … null][null, null, … null][null, null, … null][1992-12-02 00:02:55, 1992-12-02 00:02:55, … 1992-12-02 00:02:55]nullnullnullnullnullnullnull["FP1-ESPRIT 1"]["European programme (EEC) for research and development in information technologies (ESPRIT), 1984-1988"][null]nullnull["software"]["/natural sciences/computer and information sciences/software"][null]0.019851986365null
"EV4T0018"nullnull"ANALYSIS OF OPTICAL AND THERMA…1987-12-011990-11-30nullnull"FP1-TECHHAZ C"nullnull"FP1"nullnull"CSC"null"DEVELOPMENT OF APPROPRIATE DIG…1994-01-23 00:00:0211996null"H1984"nullnullnullnullnullnullnullnullnullnullnullnullnull[null]["BE"]["VON KARMAN INSTITUTE FOR FLUID DYNAMICS"][null]["RHODE-ST-GENESE"]["50.7562383,4.3873549"][null]["coordinator"][null][null][null][null][null][1994-01-23 00:00:02]nullnullnullnullnullnullnull["FP1-TECHHAZ C"]["Multiannual R&D programmes (EEC) in the field of the environment - Pilot projects on major technological hazards -, 1986-1990"][null]nullnull["graphic design", "software", … "laser physics"]["/social sciences/media and communications/graphic design", "/natural sciences/computer and information sciences/software", … "/natural sciences/physical sciences/optics/laser physics"][null, null, … null]0.0198719901095null
" ], "text/plain": [ "shape: (5, 68)\n", "┌──────────┬───────────┬────────┬───────────────┬───┬───────────┬─────────┬──────────────┬─────────┐\n", "│ id ┆ acronym ┆ status ┆ title ┆ … ┆ startYear ┆ endYear ┆ durationDays ┆ ecRatio │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ str ┆ ┆ i32 ┆ i32 ┆ i64 ┆ f64 │\n", "╞══════════╪═══════════╪════════╪═══════════════╪═══╪═══════════╪═════════╪══════════════╪═════════╡\n", "│ 1476 ┆ null ┆ null ┆ Integrated ┆ … ┆ 1985 ┆ 1988 ┆ 1095 ┆ null │\n", "│ ┆ ┆ ┆ Sensor-Based ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ Robot … ┆ ┆ ┆ ┆ ┆ │\n", "│ 395 ┆ INCA ┆ null ┆ An Integrated ┆ … ┆ 1984 ┆ 1989 ┆ 1826 ┆ null │\n", "│ ┆ ┆ ┆ Network ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ Architec… ┆ ┆ ┆ ┆ ┆ │\n", "│ EN3M0096 ┆ null ┆ null ┆ IMPROVEMENT ┆ … ┆ 1988 ┆ 1989 ┆ 242 ┆ null │\n", "│ ┆ ┆ ┆ AND ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ APPLICATION ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ OF… ┆ ┆ ┆ ┆ ┆ │\n", "│ 874 ┆ CONCORDIA ┆ null ┆ Integrated ┆ … ┆ 1985 ┆ 1986 ┆ 365 ┆ null │\n", "│ ┆ ┆ ┆ Environment ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ for Rel… ┆ ┆ ┆ ┆ ┆ │\n", "│ EV4T0018 ┆ null ┆ null ┆ ANALYSIS OF ┆ … ┆ 1987 ┆ 1990 ┆ 1095 ┆ null │\n", "│ ┆ ┆ ┆ OPTICAL AND ┆ ┆ ┆ ┆ ┆ │\n", "│ ┆ ┆ ┆ THERMA… ┆ ┆ ┆ ┆ ┆ │\n", "└──────────┴───────────┴────────┴───────────────┴───┴───────────┴─────────┴──────────────┴─────────┘" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "consolidated.head()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['en' 'de' 'it' 'sw' 'ro' 'pl' 'pt' 'ca' 'af' 'es' 'nl' 'no' 'da' 'vi'\n", " 'et' 'fr' 'cy' 'tl' 'so' 'sv' 'tr' 'id' 'lt' 'hu' 'fi' 'hr' 'sl'\n", " 'unknown' 'sk' 'cs']\n", "title\n", "en 56676\n", "de 1012\n", "it 343\n", "ro 238\n", "ca 224\n", "fr 144\n", "da 131\n", "es 81\n", "tl 69\n", "vi 69\n", "nl 61\n", "pt 60\n", "af 46\n", "no 43\n", "id 36\n", "so 29\n", "sv 24\n", "pl 15\n", "cy 14\n", "et 11\n", "fi 10\n", "sw 9\n", 
"hr 6\n", "sl 5\n", "lt 3\n", "tr 2\n", "unknown 2\n", "hu 1\n", "sk 1\n", "cs 1\n", "Name: count, dtype: int64\n" ] } ], "source": [ "from langdetect import detect\n", "\n", "def is_english(text):\n", " try:\n", " return detect(text) == 'en'\n", " except:\n", " return False\n", "\n", "def lang_text(text):\n", " try:\n", " return detect(text)\n", " except:\n", " return \"unknown\"\n", "\n", "languages=consolidated_clean.filter(pl.col('status').is_not_null()).to_pandas()['title'].apply(lang_text)\n", "unique_languages = languages.unique()\n", "print(unique_languages)\n", "language_counts = languages.value_counts()\n", "print(language_counts)\n", "\n", "#print(\"English coverage:\", consolidated.to_pandas()['title'].apply(is_english).mean())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🧹 Preparing data...\n", "💡 Embedding text...\n", "Loading saved embeddings for column 'title'...\n", "Fitting SVD for column 'title'...\n", "Loading saved embeddings for column 'objective'...\n", "Fitting SVD for column 'objective'...\n", "Loading saved embeddings for column 'topic_title'...\n", "Fitting SVD for column 'topic_title'...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[I 2025-05-12 21:30:14,899] A new study created in memory with name: no-name-7695811c-115a-4d9f-a17b-3fc16faf602a\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "🧱 Building pipeline...\n", "🎯 Training model with Optuna...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['AW', 'GM', 'GN', 'GY', 'JE', 'MV'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['CARRETERA AL AJUSCO NUM. 
377'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['FI', 'GA', 'NO', 'RO'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['COMPUTATIONAL TOPOLOGY', 'ELECTRICAL ENGINEERING, ELECTRONIC ENGINEERING, INFORMATION ENGINEERING', 'FLORICULTURE', 'FRUGAL ARTIFICIAL INTELLIGENCE', 'GENERAL MEDICINE', 'HISTORY OF PHILOSOPHY', 'INFORMATION ENGINEERING', 'ISLAMIC SCHOOLS', 'OTHER SOCIAL SCIENCES', 'SILICENE', 'ULTRAVIOLET LASERS'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BI', 'FJ', 'GD', 'GW', 'HT', 'LS', 'PG', 'SV', 'ZZ'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['3D4DDC224DA059CAA718E6F471585874', 'ET', 'HR', 'LT', 'MT', 'PT', 'SR'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['APPLIED MECHANICS', 'BASIC MEDICINE', 'EDGE ARTIFICIAL INTELLIGENCE', 'FIXED WIRELESS NETWORK', 'ICE GIANTS', 'ORTHODONTICS', 'PERIODONTICS', 'PLANT CLONING', 'REGIONAL HUMAN RIGHTS', 'STRABISMUS'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BB', 'BH', 'BN', 'CF', 'GF', 'GT', 'HN', 'KW', 'LR', 'MR', 'OM', 'QA', 'SL', 'SM', 'ST', 'VG'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BUILDING B'] will be ignored\n", " warnings.warn(\n", 
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['66C27239DA0384FFDD662CB8A543DBAB', 'AD460A03C5F2CBDC967586AF7495D308'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['ADMIRALTY LAW', 'BIOLOGICAL BEHAVIOURAL SCIENCES', 'BREAST IMPLANTS', 'ETHICAL THEORIES', 'INERTIAL NAVIGATION SYSTEM', 'NUCLEAR CHEMISTRY', 'OTHER AGRICULTURAL SCIENCES', 'OTHER MEDICAL SCIENCES'] will be ignored\n", " warnings.warn(\n", "[I 2025-05-12 21:35:06,892] Trial 0 finished with value: 0.42635007849963164 and parameters: {'n_estimators': 151, 'max_depth': 5, 'learning_rate': 0.27984807723222593, 'scale_pos_weight': 6.072340577651307}. Best is trial 0 with value: 0.42635007849963164.\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['AW', 'GM', 'GN', 'GY', 'JE', 'MV'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['CARRETERA AL AJUSCO NUM. 
377'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['FI', 'GA', 'NO', 'RO'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['COMPUTATIONAL TOPOLOGY', 'ELECTRICAL ENGINEERING, ELECTRONIC ENGINEERING, INFORMATION ENGINEERING', 'FLORICULTURE', 'FRUGAL ARTIFICIAL INTELLIGENCE', 'GENERAL MEDICINE', 'HISTORY OF PHILOSOPHY', 'INFORMATION ENGINEERING', 'ISLAMIC SCHOOLS', 'OTHER SOCIAL SCIENCES', 'SILICENE', 'ULTRAVIOLET LASERS'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BI', 'FJ', 'GD', 'GW', 'HT', 'LS', 'PG', 'SV', 'ZZ'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['3D4DDC224DA059CAA718E6F471585874', 'ET', 'HR', 'LT', 'MT', 'PT', 'SR'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['APPLIED MECHANICS', 'BASIC MEDICINE', 'EDGE ARTIFICIAL INTELLIGENCE', 'FIXED WIRELESS NETWORK', 'ICE GIANTS', 'ORTHODONTICS', 'PERIODONTICS', 'PLANT CLONING', 'REGIONAL HUMAN RIGHTS', 'STRABISMUS'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BB', 'BH', 'BN', 'CF', 'GF', 'GT', 'HN', 'KW', 'LR', 'MR', 'OM', 'QA', 'SL', 'SM', 'ST', 'VG'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BUILDING B'] will be ignored\n", " warnings.warn(\n", 
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['66C27239DA0384FFDD662CB8A543DBAB', 'AD460A03C5F2CBDC967586AF7495D308'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['ADMIRALTY LAW', 'BIOLOGICAL BEHAVIOURAL SCIENCES', 'BREAST IMPLANTS', 'ETHICAL THEORIES', 'INERTIAL NAVIGATION SYSTEM', 'NUCLEAR CHEMISTRY', 'OTHER AGRICULTURAL SCIENCES', 'OTHER MEDICAL SCIENCES'] will be ignored\n", " warnings.warn(\n", "[I 2025-05-12 21:42:41,795] Trial 1 finished with value: 0.4292899461188686 and parameters: {'n_estimators': 175, 'max_depth': 5, 'learning_rate': 0.18982366215962182, 'scale_pos_weight': 2.954471437724056}. Best is trial 1 with value: 0.4292899461188686.\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['AW', 'GM', 'GN', 'GY', 'JE', 'MV'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['CARRETERA AL AJUSCO NUM. 
377'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['FI', 'GA', 'NO', 'RO'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['COMPUTATIONAL TOPOLOGY', 'ELECTRICAL ENGINEERING, ELECTRONIC ENGINEERING, INFORMATION ENGINEERING', 'FLORICULTURE', 'FRUGAL ARTIFICIAL INTELLIGENCE', 'GENERAL MEDICINE', 'HISTORY OF PHILOSOPHY', 'INFORMATION ENGINEERING', 'ISLAMIC SCHOOLS', 'OTHER SOCIAL SCIENCES', 'SILICENE', 'ULTRAVIOLET LASERS'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BI', 'FJ', 'GD', 'GW', 'HT', 'LS', 'PG', 'SV', 'ZZ'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['3D4DDC224DA059CAA718E6F471585874', 'ET', 'HR', 'LT', 'MT', 'PT', 'SR'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['APPLIED MECHANICS', 'BASIC MEDICINE', 'EDGE ARTIFICIAL INTELLIGENCE', 'FIXED WIRELESS NETWORK', 'ICE GIANTS', 'ORTHODONTICS', 'PERIODONTICS', 'PLANT CLONING', 'REGIONAL HUMAN RIGHTS', 'STRABISMUS'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BB', 'BH', 'BN', 'CF', 'GF', 'GT', 'HN', 'KW', 'LR', 'MR', 'OM', 'QA', 'SL', 'SM', 'ST', 'VG'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BUILDING B'] will be ignored\n", " warnings.warn(\n", 
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['66C27239DA0384FFDD662CB8A543DBAB', 'AD460A03C5F2CBDC967586AF7495D308'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['ADMIRALTY LAW', 'BIOLOGICAL BEHAVIOURAL SCIENCES', 'BREAST IMPLANTS', 'ETHICAL THEORIES', 'INERTIAL NAVIGATION SYSTEM', 'NUCLEAR CHEMISTRY', 'OTHER AGRICULTURAL SCIENCES', 'OTHER MEDICAL SCIENCES'] will be ignored\n", " warnings.warn(\n", "[I 2025-05-12 21:52:48,155] Trial 2 finished with value: 0.4289493329967354 and parameters: {'n_estimators': 156, 'max_depth': 10, 'learning_rate': 0.13243428465407345, 'scale_pos_weight': 3.250588524606016}. Best is trial 1 with value: 0.4292899461188686.\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['AW', 'GM', 'GN', 'GY', 'JE', 'MV'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['CARRETERA AL AJUSCO NUM. 
377'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['FI', 'GA', 'NO', 'RO'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['COMPUTATIONAL TOPOLOGY', 'ELECTRICAL ENGINEERING, ELECTRONIC ENGINEERING, INFORMATION ENGINEERING', 'FLORICULTURE', 'FRUGAL ARTIFICIAL INTELLIGENCE', 'GENERAL MEDICINE', 'HISTORY OF PHILOSOPHY', 'INFORMATION ENGINEERING', 'ISLAMIC SCHOOLS', 'OTHER SOCIAL SCIENCES', 'SILICENE', 'ULTRAVIOLET LASERS'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BI', 'FJ', 'GD', 'GW', 'HT', 'LS', 'PG', 'SV', 'ZZ'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['3D4DDC224DA059CAA718E6F471585874', 'ET', 'HR', 'LT', 'MT', 'PT', 'SR'] will be ignored\n", " warnings.warn(\n", "c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['APPLIED MECHANICS', 'BASIC MEDICINE', 'EDGE ARTIFICIAL INTELLIGENCE', 'FIXED WIRELESS NETWORK', 'ICE GIANTS', 'ORTHODONTICS', 'PERIODONTICS', 'PLANT CLONING', 'REGIONAL HUMAN RIGHTS', 'STRABISMUS'] will be ignored\n", " warnings.warn(\n", "[W 2025-05-12 22:00:45,953] Trial 3 failed with parameters: {'n_estimators': 117, 'max_depth': 9, 'learning_rate': 0.09612739086701036, 'scale_pos_weight': 9.26268599189327} because of the following error: KeyboardInterrupt().\n", "Traceback (most recent call last):\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py\", line 197, in _run_trial\n", " value_or_values = func(trial)\n", " ^^^^^^^^^^^\n", " File 
\"C:\\Users\\Romain\\AppData\\Local\\Temp\\ipykernel_43912\\262172098.py\", line 256, in objective\n", " scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(3, shuffle=True, random_state=42),\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 213, in wrapper\n", " return func(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 712, in cross_val_score\n", " cv_results = cross_validate(\n", " ^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 213, in wrapper\n", " return func(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 423, in cross_validate\n", " results = parallel(\n", " ^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\parallel.py\", line 74, in __call__\n", " return super().__call__(iterable_with_config)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\parallel.py\", line 1918, in __call__\n", " return output if self.return_generator else list(output)\n", " ^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\parallel.py\", line 1847, in _get_sequential_output\n", " res = func(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\parallel.py\", line 136, in __call__\n", " return self.function(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n", " estimator.fit(X_train, y_train, 
**fit_params)\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n", " return fit_method(estimator, *args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py\", line 329, in fit\n", " Xt, yt = self._fit(X, y, routed_params)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py\", line 265, in _fit\n", " X, y, fitted_transformer = fit_resample_one_cached(\n", " ^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\memory.py\", line 312, in __call__\n", " return self.func(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py\", line 1057, in _fit_resample_one\n", " X_res, y_res = sampler.fit_resample(X, y, **params.get(\"fit_resample\", {}))\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py\", line 208, in fit_resample\n", " return super().fit_resample(X, y)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py\", line 112, in fit_resample\n", " output = self._fit_resample(X, y)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\combine\\_smote_enn.py\", line 161, in _fit_resample\n", " return self.enn_.fit_resample(X_res, y_res)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py\", line 208, in fit_resample\n", " return super().fit_resample(X, y)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py\", line 112, in fit_resample\n", " output = self._fit_resample(X, y)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^\n", 
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\under_sampling\\_prototype_selection\\_edited_nearest_neighbours.py\", line 168, in _fit_resample\n", " nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\neighbors\\_base.py\", line 849, in kneighbors\n", " results = ArgKmin.compute(\n", " ^^^^^^^^^^^^^^^^\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_pairwise_distances_reduction\\_dispatcher.py\", line 278, in compute\n", " return ArgKmin64.compute(\n", " ^^^^^^^^^^^^^^^^^^\n", " File \"sklearn\\\\metrics\\\\_pairwise_distances_reduction\\\\_argkmin.pyx\", line 59, in sklearn.metrics._pairwise_distances_reduction._argkmin.ArgKmin64.compute\n", " File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\threadpoolctl.py\", line 592, in __exit__\n", " def __exit__(self, type, value, traceback):\n", " \n", "KeyboardInterrupt\n", "[W 2025-05-12 22:00:46,241] Trial 3 failed with value None.\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[3], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m status_prediction_model(consolidated_clean\u001b[38;5;241m.\u001b[39mto_pandas())\n", "Cell \u001b[1;32mIn[2], line 261\u001b[0m, in \u001b[0;36mstatus_prediction_model\u001b[1;34m(df)\u001b[0m\n\u001b[0;32m 258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m scores\u001b[38;5;241m.\u001b[39mmean()\n\u001b[0;32m 260\u001b[0m study \u001b[38;5;241m=\u001b[39m optuna\u001b[38;5;241m.\u001b[39mcreate_study(direction\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmaximize\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m--> 
261\u001b[0m study\u001b[38;5;241m.\u001b[39moptimize(objective, n_trials\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m)\n\u001b[0;32m 262\u001b[0m best_params \u001b[38;5;241m=\u001b[39m study\u001b[38;5;241m.\u001b[39mbest_trial\u001b[38;5;241m.\u001b[39mparams\n\u001b[0;32m 263\u001b[0m base_model\u001b[38;5;241m.\u001b[39mset_params(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mbest_params)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\study.py:475\u001b[0m, in \u001b[0;36mStudy.optimize\u001b[1;34m(self, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001b[0m\n\u001b[0;32m 373\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21moptimize\u001b[39m(\n\u001b[0;32m 374\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 375\u001b[0m func: ObjectiveFuncType,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 382\u001b[0m show_progress_bar: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 383\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 384\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Optimize an objective function.\u001b[39;00m\n\u001b[0;32m 385\u001b[0m \n\u001b[0;32m 386\u001b[0m \u001b[38;5;124;03m Optimization is done by choosing a suitable set of hyperparameter values from a given\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 473\u001b[0m \u001b[38;5;124;03m If nested invocation of this method occurs.\u001b[39;00m\n\u001b[0;32m 474\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 475\u001b[0m _optimize(\n\u001b[0;32m 476\u001b[0m study\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 477\u001b[0m func\u001b[38;5;241m=\u001b[39mfunc,\n\u001b[0;32m 478\u001b[0m n_trials\u001b[38;5;241m=\u001b[39mn_trials,\n\u001b[0;32m 479\u001b[0m 
timeout\u001b[38;5;241m=\u001b[39mtimeout,\n\u001b[0;32m 480\u001b[0m n_jobs\u001b[38;5;241m=\u001b[39mn_jobs,\n\u001b[0;32m 481\u001b[0m catch\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mtuple\u001b[39m(catch) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(catch, Iterable) \u001b[38;5;28;01melse\u001b[39;00m (catch,),\n\u001b[0;32m 482\u001b[0m callbacks\u001b[38;5;241m=\u001b[39mcallbacks,\n\u001b[0;32m 483\u001b[0m gc_after_trial\u001b[38;5;241m=\u001b[39mgc_after_trial,\n\u001b[0;32m 484\u001b[0m show_progress_bar\u001b[38;5;241m=\u001b[39mshow_progress_bar,\n\u001b[0;32m 485\u001b[0m )\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py:63\u001b[0m, in \u001b[0;36m_optimize\u001b[1;34m(study, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001b[0m\n\u001b[0;32m 61\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 62\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_jobs \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m---> 63\u001b[0m _optimize_sequential(\n\u001b[0;32m 64\u001b[0m study,\n\u001b[0;32m 65\u001b[0m func,\n\u001b[0;32m 66\u001b[0m n_trials,\n\u001b[0;32m 67\u001b[0m timeout,\n\u001b[0;32m 68\u001b[0m catch,\n\u001b[0;32m 69\u001b[0m callbacks,\n\u001b[0;32m 70\u001b[0m gc_after_trial,\n\u001b[0;32m 71\u001b[0m reseed_sampler_rng\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 72\u001b[0m time_start\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 73\u001b[0m progress_bar\u001b[38;5;241m=\u001b[39mprogress_bar,\n\u001b[0;32m 74\u001b[0m )\n\u001b[0;32m 75\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 76\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_jobs \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n", "File 
\u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py:160\u001b[0m, in \u001b[0;36m_optimize_sequential\u001b[1;34m(study, func, n_trials, timeout, catch, callbacks, gc_after_trial, reseed_sampler_rng, time_start, progress_bar)\u001b[0m\n\u001b[0;32m 157\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 160\u001b[0m frozen_trial \u001b[38;5;241m=\u001b[39m _run_trial(study, func, catch)\n\u001b[0;32m 161\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 162\u001b[0m \u001b[38;5;66;03m# The following line mitigates memory problems that can be occurred in some\u001b[39;00m\n\u001b[0;32m 163\u001b[0m \u001b[38;5;66;03m# environments (e.g., services that use computing containers such as GitHub Actions).\u001b[39;00m\n\u001b[0;32m 164\u001b[0m \u001b[38;5;66;03m# Please refer to the following PR for further details:\u001b[39;00m\n\u001b[0;32m 165\u001b[0m \u001b[38;5;66;03m# https://github.com/optuna/optuna/pull/325.\u001b[39;00m\n\u001b[0;32m 166\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gc_after_trial:\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py:248\u001b[0m, in \u001b[0;36m_run_trial\u001b[1;34m(study, func, catch)\u001b[0m\n\u001b[0;32m 241\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShould not reach.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 243\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m 244\u001b[0m frozen_trial\u001b[38;5;241m.\u001b[39mstate \u001b[38;5;241m==\u001b[39m TrialState\u001b[38;5;241m.\u001b[39mFAIL\n\u001b[0;32m 245\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m func_err \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 246\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m 
\u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(func_err, catch)\n\u001b[0;32m 247\u001b[0m ):\n\u001b[1;32m--> 248\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m func_err\n\u001b[0;32m 249\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m frozen_trial\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py:197\u001b[0m, in \u001b[0;36m_run_trial\u001b[1;34m(study, func, catch)\u001b[0m\n\u001b[0;32m 195\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m get_heartbeat_thread(trial\u001b[38;5;241m.\u001b[39m_trial_id, study\u001b[38;5;241m.\u001b[39m_storage):\n\u001b[0;32m 196\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 197\u001b[0m value_or_values \u001b[38;5;241m=\u001b[39m func(trial)\n\u001b[0;32m 198\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mTrialPruned \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 199\u001b[0m \u001b[38;5;66;03m# TODO(mamu): Handle multi-objective cases.\u001b[39;00m\n\u001b[0;32m 200\u001b[0m state \u001b[38;5;241m=\u001b[39m TrialState\u001b[38;5;241m.\u001b[39mPRUNED\n", "Cell \u001b[1;32mIn[2], line 256\u001b[0m, in \u001b[0;36mstatus_prediction_model..objective\u001b[1;34m(trial)\u001b[0m\n\u001b[0;32m 254\u001b[0m base_model\u001b[38;5;241m.\u001b[39mset_params(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams)\n\u001b[0;32m 255\u001b[0m pipeline \u001b[38;5;241m=\u001b[39m build_pipeline(preprocessor, base_model)\n\u001b[1;32m--> 256\u001b[0m scores \u001b[38;5;241m=\u001b[39m cross_val_score(pipeline, X, y, cv\u001b[38;5;241m=\u001b[39mStratifiedKFold(\u001b[38;5;241m3\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m),\n\u001b[0;32m 257\u001b[0m scoring\u001b[38;5;241m=\u001b[39mmake_scorer(f1_score, pos_label\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m))\n\u001b[0;32m 258\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m scores\u001b[38;5;241m.\u001b[39mmean()\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:712\u001b[0m, in \u001b[0;36mcross_val_score\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, error_score)\u001b[0m\n\u001b[0;32m 709\u001b[0m \u001b[38;5;66;03m# To ensure multimetric format is not supported\u001b[39;00m\n\u001b[0;32m 710\u001b[0m scorer \u001b[38;5;241m=\u001b[39m check_scoring(estimator, scoring\u001b[38;5;241m=\u001b[39mscoring)\n\u001b[1;32m--> 712\u001b[0m cv_results \u001b[38;5;241m=\u001b[39m cross_validate(\n\u001b[0;32m 713\u001b[0m estimator\u001b[38;5;241m=\u001b[39mestimator,\n\u001b[0;32m 714\u001b[0m X\u001b[38;5;241m=\u001b[39mX,\n\u001b[0;32m 715\u001b[0m y\u001b[38;5;241m=\u001b[39my,\n\u001b[0;32m 716\u001b[0m groups\u001b[38;5;241m=\u001b[39mgroups,\n\u001b[0;32m 717\u001b[0m scoring\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscore\u001b[39m\u001b[38;5;124m\"\u001b[39m: scorer},\n\u001b[0;32m 718\u001b[0m cv\u001b[38;5;241m=\u001b[39mcv,\n\u001b[0;32m 719\u001b[0m n_jobs\u001b[38;5;241m=\u001b[39mn_jobs,\n\u001b[0;32m 720\u001b[0m verbose\u001b[38;5;241m=\u001b[39mverbose,\n\u001b[0;32m 721\u001b[0m fit_params\u001b[38;5;241m=\u001b[39mfit_params,\n\u001b[0;32m 722\u001b[0m params\u001b[38;5;241m=\u001b[39mparams,\n\u001b[0;32m 723\u001b[0m pre_dispatch\u001b[38;5;241m=\u001b[39mpre_dispatch,\n\u001b[0;32m 724\u001b[0m error_score\u001b[38;5;241m=\u001b[39merror_score,\n\u001b[0;32m 725\u001b[0m )\n\u001b[0;32m 726\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
cv_results[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:423\u001b[0m, in \u001b[0;36mcross_validate\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)\u001b[0m\n\u001b[0;32m 420\u001b[0m \u001b[38;5;66;03m# We clone the estimator to make sure that all the folds are\u001b[39;00m\n\u001b[0;32m 421\u001b[0m \u001b[38;5;66;03m# independent, and that it is pickle-able.\u001b[39;00m\n\u001b[0;32m 422\u001b[0m parallel \u001b[38;5;241m=\u001b[39m Parallel(n_jobs\u001b[38;5;241m=\u001b[39mn_jobs, verbose\u001b[38;5;241m=\u001b[39mverbose, pre_dispatch\u001b[38;5;241m=\u001b[39mpre_dispatch)\n\u001b[1;32m--> 423\u001b[0m results \u001b[38;5;241m=\u001b[39m parallel(\n\u001b[0;32m 424\u001b[0m delayed(_fit_and_score)(\n\u001b[0;32m 425\u001b[0m clone(estimator),\n\u001b[0;32m 426\u001b[0m X,\n\u001b[0;32m 427\u001b[0m y,\n\u001b[0;32m 428\u001b[0m scorer\u001b[38;5;241m=\u001b[39mscorers,\n\u001b[0;32m 429\u001b[0m train\u001b[38;5;241m=\u001b[39mtrain,\n\u001b[0;32m 430\u001b[0m test\u001b[38;5;241m=\u001b[39mtest,\n\u001b[0;32m 431\u001b[0m verbose\u001b[38;5;241m=\u001b[39mverbose,\n\u001b[0;32m 432\u001b[0m parameters\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 433\u001b[0m fit_params\u001b[38;5;241m=\u001b[39mrouted_params\u001b[38;5;241m.\u001b[39mestimator\u001b[38;5;241m.\u001b[39mfit,\n\u001b[0;32m 434\u001b[0m score_params\u001b[38;5;241m=\u001b[39mrouted_params\u001b[38;5;241m.\u001b[39mscorer\u001b[38;5;241m.\u001b[39mscore,\n\u001b[0;32m 435\u001b[0m 
return_train_score\u001b[38;5;241m=\u001b[39mreturn_train_score,\n\u001b[0;32m 436\u001b[0m return_times\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[0;32m 437\u001b[0m return_estimator\u001b[38;5;241m=\u001b[39mreturn_estimator,\n\u001b[0;32m 438\u001b[0m error_score\u001b[38;5;241m=\u001b[39merror_score,\n\u001b[0;32m 439\u001b[0m )\n\u001b[0;32m 440\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m train, test \u001b[38;5;129;01min\u001b[39;00m indices\n\u001b[0;32m 441\u001b[0m )\n\u001b[0;32m 443\u001b[0m _warn_or_raise_about_fit_failures(results, error_score)\n\u001b[0;32m 445\u001b[0m \u001b[38;5;66;03m# For callable scoring, the return type is only know after calling. If the\u001b[39;00m\n\u001b[0;32m 446\u001b[0m \u001b[38;5;66;03m# return type is a dictionary, the error scores can now be inserted with\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[38;5;66;03m# the correct key.\u001b[39;00m\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m(iterable_with_config)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\parallel.py:1918\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1916\u001b[0m output \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m 1917\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output)\n\u001b[0;32m 1920\u001b[0m \u001b[38;5;66;03m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m 1921\u001b[0m \u001b[38;5;66;03m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m 1922\u001b[0m \u001b[38;5;66;03m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m 1923\u001b[0m \u001b[38;5;66;03m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m 1924\u001b[0m \u001b[38;5;66;03m# callback.\u001b[39;00m\n\u001b[0;32m 1925\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\parallel.py:1847\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1845\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_batches \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1847\u001b[0m res \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 1848\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_completed_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m 
\u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1849\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_progress()\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\parallel.py:136\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 134\u001b[0m config \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mconfig):\n\u001b[1;32m--> 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunction(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:888\u001b[0m, in \u001b[0;36m_fit_and_score\u001b[1;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, score_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)\u001b[0m\n\u001b[0;32m 886\u001b[0m estimator\u001b[38;5;241m.\u001b[39mfit(X_train, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\n\u001b[0;32m 887\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 888\u001b[0m estimator\u001b[38;5;241m.\u001b[39mfit(X_train, y_train, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\n\u001b[0;32m 890\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[0;32m 891\u001b[0m \u001b[38;5;66;03m# Note fit time as time until error\u001b[39;00m\n\u001b[0;32m 892\u001b[0m fit_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start_time\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in 
\u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py:329\u001b[0m, in \u001b[0;36mPipeline.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 285\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model.\u001b[39;00m\n\u001b[0;32m 286\u001b[0m \n\u001b[0;32m 287\u001b[0m \u001b[38;5;124;03mFit all the transforms/samplers one after the other and\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 326\u001b[0m \u001b[38;5;124;03m This estimator.\u001b[39;00m\n\u001b[0;32m 327\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 328\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 329\u001b[0m Xt, yt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit(X, y, routed_params)\n\u001b[0;32m 330\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n\u001b[0;32m 331\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpassthrough\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py:265\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 255\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m fit_transform_one_cached(\n\u001b[0;32m 256\u001b[0m cloned_transformer,\n\u001b[0;32m 257\u001b[0m X,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 262\u001b[0m params\u001b[38;5;241m=\u001b[39mrouted_params[name],\n\u001b[0;32m 263\u001b[0m )\n\u001b[0;32m 264\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(cloned_transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_resample\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m--> 265\u001b[0m X, y, fitted_transformer \u001b[38;5;241m=\u001b[39m fit_resample_one_cached(\n\u001b[0;32m 266\u001b[0m cloned_transformer,\n\u001b[0;32m 267\u001b[0m X,\n\u001b[0;32m 268\u001b[0m y,\n\u001b[0;32m 269\u001b[0m message_clsname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 270\u001b[0m message\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(step_idx),\n\u001b[0;32m 271\u001b[0m params\u001b[38;5;241m=\u001b[39mrouted_params[name],\n\u001b[0;32m 272\u001b[0m )\n\u001b[0;32m 273\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 274\u001b[0m \u001b[38;5;66;03m# transformer. 
This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 275\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 276\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunc(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py:1057\u001b[0m, in \u001b[0;36m_fit_resample_one\u001b[1;34m(sampler, X, y, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1055\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_fit_resample_one\u001b[39m(sampler, X, y, message_clsname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m, message\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, params\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m 1056\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[1;32m-> 1057\u001b[0m X_res, y_res \u001b[38;5;241m=\u001b[39m sampler\u001b[38;5;241m.\u001b[39mfit_resample(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_resample\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\n\u001b[0;32m 1059\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
X_res, y_res, sampler\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py:208\u001b[0m, in \u001b[0;36mBaseSampler.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Resample the dataset.\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124;03m The corresponding label of `X_resampled`.\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m--> 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mfit_resample(X, y)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py:112\u001b[0m, in \u001b[0;36mSamplerMixin.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 106\u001b[0m X, y, binarize_y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_X_y(X, y)\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy, y, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampling_type\n\u001b[0;32m 110\u001b[0m )\n\u001b[1;32m--> 112\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_resample(X, y)\n\u001b[0;32m 114\u001b[0m y_ \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 115\u001b[0m label_binarize(output[\u001b[38;5;241m1\u001b[39m], classes\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39munique(y)) \u001b[38;5;28;01mif\u001b[39;00m binarize_y \u001b[38;5;28;01melse\u001b[39;00m 
output[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 116\u001b[0m )\n\u001b[0;32m 118\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m arrays_transformer\u001b[38;5;241m.\u001b[39mtransform(output[\u001b[38;5;241m0\u001b[39m], y_)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\combine\\_smote_enn.py:161\u001b[0m, in \u001b[0;36mSMOTEENN._fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 158\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy\n\u001b[0;32m 160\u001b[0m X_res, y_res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmote_\u001b[38;5;241m.\u001b[39mfit_resample(X, y)\n\u001b[1;32m--> 161\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menn_\u001b[38;5;241m.\u001b[39mfit_resample(X_res, y_res)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py:208\u001b[0m, in \u001b[0;36mBaseSampler.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Resample the dataset.\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124;03m The corresponding label of `X_resampled`.\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m--> 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mfit_resample(X, y)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py:112\u001b[0m, in \u001b[0;36mSamplerMixin.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 
106\u001b[0m X, y, binarize_y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_X_y(X, y)\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy, y, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampling_type\n\u001b[0;32m 110\u001b[0m )\n\u001b[1;32m--> 112\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_resample(X, y)\n\u001b[0;32m 114\u001b[0m y_ \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 115\u001b[0m label_binarize(output[\u001b[38;5;241m1\u001b[39m], classes\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39munique(y)) \u001b[38;5;28;01mif\u001b[39;00m binarize_y \u001b[38;5;28;01melse\u001b[39;00m output[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 116\u001b[0m )\n\u001b[0;32m 118\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m arrays_transformer\u001b[38;5;241m.\u001b[39mtransform(output[\u001b[38;5;241m0\u001b[39m], y_)\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\under_sampling\\_prototype_selection\\_edited_nearest_neighbours.py:168\u001b[0m, in \u001b[0;36mEditedNearestNeighbours._fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 166\u001b[0m X_class \u001b[38;5;241m=\u001b[39m _safe_indexing(X, target_class_indices)\n\u001b[0;32m 167\u001b[0m y_class \u001b[38;5;241m=\u001b[39m _safe_indexing(y, target_class_indices)\n\u001b[1;32m--> 168\u001b[0m nnhood_idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnn_\u001b[38;5;241m.\u001b[39mkneighbors(X_class, return_distance\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)[:, \u001b[38;5;241m1\u001b[39m:]\n\u001b[0;32m 169\u001b[0m nnhood_label \u001b[38;5;241m=\u001b[39m y[nnhood_idx]\n\u001b[0;32m 
170\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkind_sel \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmode\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\neighbors\\_base.py:849\u001b[0m, in \u001b[0;36mKNeighborsMixin.kneighbors\u001b[1;34m(self, X, n_neighbors, return_distance)\u001b[0m\n\u001b[0;32m 842\u001b[0m use_pairwise_distances_reductions \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 843\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbrute\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 844\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m ArgKmin\u001b[38;5;241m.\u001b[39mis_usable_for(\n\u001b[0;32m 845\u001b[0m X \u001b[38;5;28;01mif\u001b[39;00m X \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_X, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_X, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39meffective_metric_\n\u001b[0;32m 846\u001b[0m )\n\u001b[0;32m 847\u001b[0m )\n\u001b[0;32m 848\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_pairwise_distances_reductions:\n\u001b[1;32m--> 849\u001b[0m results \u001b[38;5;241m=\u001b[39m ArgKmin\u001b[38;5;241m.\u001b[39mcompute(\n\u001b[0;32m 850\u001b[0m X\u001b[38;5;241m=\u001b[39mX,\n\u001b[0;32m 851\u001b[0m Y\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_X,\n\u001b[0;32m 852\u001b[0m k\u001b[38;5;241m=\u001b[39mn_neighbors,\n\u001b[0;32m 853\u001b[0m metric\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39meffective_metric_,\n\u001b[0;32m 854\u001b[0m 
metric_kwargs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39meffective_metric_params_,\n\u001b[0;32m 855\u001b[0m strategy\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 856\u001b[0m return_distance\u001b[38;5;241m=\u001b[39mreturn_distance,\n\u001b[0;32m 857\u001b[0m )\n\u001b[0;32m 859\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m (\n\u001b[0;32m 860\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbrute\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetric \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprecomputed\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m issparse(X)\n\u001b[0;32m 861\u001b[0m ):\n\u001b[0;32m 862\u001b[0m results \u001b[38;5;241m=\u001b[39m _kneighbors_from_graph(\n\u001b[0;32m 863\u001b[0m X, n_neighbors\u001b[38;5;241m=\u001b[39mn_neighbors, return_distance\u001b[38;5;241m=\u001b[39mreturn_distance\n\u001b[0;32m 864\u001b[0m )\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_pairwise_distances_reduction\\_dispatcher.py:278\u001b[0m, in \u001b[0;36mArgKmin.compute\u001b[1;34m(cls, X, Y, k, metric, chunk_size, metric_kwargs, strategy, return_distance)\u001b[0m\n\u001b[0;32m 197\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Compute the argkmin reduction.\u001b[39;00m\n\u001b[0;32m 198\u001b[0m \n\u001b[0;32m 199\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 275\u001b[0m \u001b[38;5;124;03mreturns.\u001b[39;00m\n\u001b[0;32m 276\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 277\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m X\u001b[38;5;241m.\u001b[39mdtype 
\u001b[38;5;241m==\u001b[39m Y\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m np\u001b[38;5;241m.\u001b[39mfloat64:\n\u001b[1;32m--> 278\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ArgKmin64\u001b[38;5;241m.\u001b[39mcompute(\n\u001b[0;32m 279\u001b[0m X\u001b[38;5;241m=\u001b[39mX,\n\u001b[0;32m 280\u001b[0m Y\u001b[38;5;241m=\u001b[39mY,\n\u001b[0;32m 281\u001b[0m k\u001b[38;5;241m=\u001b[39mk,\n\u001b[0;32m 282\u001b[0m metric\u001b[38;5;241m=\u001b[39mmetric,\n\u001b[0;32m 283\u001b[0m chunk_size\u001b[38;5;241m=\u001b[39mchunk_size,\n\u001b[0;32m 284\u001b[0m metric_kwargs\u001b[38;5;241m=\u001b[39mmetric_kwargs,\n\u001b[0;32m 285\u001b[0m strategy\u001b[38;5;241m=\u001b[39mstrategy,\n\u001b[0;32m 286\u001b[0m return_distance\u001b[38;5;241m=\u001b[39mreturn_distance,\n\u001b[0;32m 287\u001b[0m )\n\u001b[0;32m 289\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m X\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m Y\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m np\u001b[38;5;241m.\u001b[39mfloat32:\n\u001b[0;32m 290\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ArgKmin32\u001b[38;5;241m.\u001b[39mcompute(\n\u001b[0;32m 291\u001b[0m X\u001b[38;5;241m=\u001b[39mX,\n\u001b[0;32m 292\u001b[0m Y\u001b[38;5;241m=\u001b[39mY,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 298\u001b[0m return_distance\u001b[38;5;241m=\u001b[39mreturn_distance,\n\u001b[0;32m 299\u001b[0m )\n", "File \u001b[1;32msklearn\\\\metrics\\\\_pairwise_distances_reduction\\\\_argkmin.pyx:59\u001b[0m, in \u001b[0;36msklearn.metrics._pairwise_distances_reduction._argkmin.ArgKmin64.compute\u001b[1;34m()\u001b[0m\n", "File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\threadpoolctl.py:592\u001b[0m, in \u001b[0;36m_ThreadpoolLimiter.__exit__\u001b[1;34m(self, type, value, traceback)\u001b[0m\n\u001b[0;32m 589\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__enter__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 590\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n\u001b[1;32m--> 592\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28mtype\u001b[39m, value, traceback):\n\u001b[0;32m 593\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrestore_original_limits()\n\u001b[0;32m 595\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[0;32m 596\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap\u001b[39m(\u001b[38;5;28mcls\u001b[39m, controller, \u001b[38;5;241m*\u001b[39m, limits\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, user_api\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n", "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ] }, { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mFailed to interrupt the Kernel. \n", "\u001b[1;31mUnable to start Kernel 'base (Python 3.12.7)' due to a timeout waiting for the ports to get used. \n", "\u001b[1;31mView Jupyter log for further details." 
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    """Multi-hot encode one list-valued column inside a ``ColumnTransformer``.

    Thin adapter around ``sklearn.preprocessing.MultiLabelBinarizer`` exposing
    the ``fit(X, y)`` / ``transform(X)`` / ``get_feature_names_out`` trio that
    ``ColumnTransformer`` calls.

    The transformer has no constructor parameters, so ``get_params`` /
    ``set_params`` are inherited from ``BaseEstimator`` instead of being
    overridden with no-ops: an unknown parameter is now reported rather than
    silently ignored.

    Attributes set by ``fit``
    -------------------------
    col : name used as prefix for the generated feature names.
    mlb : the fitted ``MultiLabelBinarizer``.
    """

    @staticmethod
    def _as_column(X):
        """Return ``X`` as a 1-D sequence of label collections.

        ``ColumnTransformer`` passes a Series when the column is selected with
        a scalar name, but a one-column DataFrame / 2-D array when it is
        selected with a list; the latter is squeezed to its single column.
        """
        if getattr(X, "ndim", 1) == 2 and X.shape[1] == 1:
            return X.iloc[:, 0] if hasattr(X, "iloc") else X[:, 0]
        return X

    def fit(self, X, y=None):
        """Learn the label vocabulary of the column. ``y`` is ignored."""
        X = self._as_column(X)
        # Fall back to a generic prefix when the input carries no name
        # (plain lists / numpy arrays) instead of raising AttributeError.
        name = getattr(X, "name", None)
        self.col = "multilabel" if name is None else name
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit(X)
        return self

    def transform(self, X):
        """Return the multi-hot indicator matrix for ``X``."""
        return self.mlb.transform(self._as_column(X))

    def get_feature_names_out(self, input_features=None):
        """Return one ``"<column>_<label>"`` name per learned label."""
        return [f"{self.col}_{cls}" for cls in self.mlb.classes_]
class AnomalyScoreTransformer(BaseEstimator, TransformerMixin):
    """Append an IsolationForest anomaly score as one extra trailing column.

    Parameters
    ----------
    n_estimators, contamination, random_state
        Forwarded to ``IsolationForest``. The defaults reproduce the values
        that were previously hard-coded.
    """

    def __init__(self, n_estimators=200, contamination=0.1, random_state=42):
        self.n_estimators = n_estimators
        self.contamination = contamination
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit the isolation forest on ``X``. ``y`` is ignored."""
        self.n_features_in_ = np.shape(X)[1]
        # Built here (not in __init__) so set_params()/clone() take effect.
        self.model = IsolationForest(
            n_estimators=self.n_estimators,
            contamination=self.contamination,
            random_state=self.random_state,
        )
        self.model.fit(X)
        return self

    def transform(self, X):
        """Return ``X`` with the anomaly score appended (higher = more anomalous)."""
        scores = -self.model.decision_function(X)
        return np.hstack([X, scores.reshape(-1, 1)])

    def get_feature_names_out(self, input_features=None):
        """Input names plus ``"anomaly_score"``.

        Required so that ``Pipeline`` / ``ColumnTransformer`` containing this
        step can answer ``get_feature_names_out()``.
        """
        if input_features is None:
            input_features = [f"x{i}" for i in range(self.n_features_in_)]
        return np.asarray(list(input_features) + ["anomaly_score"], dtype=object)


def _as_list(value):
    """Coerce one DataFrame cell (list, tuple, numpy array, None/NaN, scalar) to a list."""
    if value is None:
        return []
    if hasattr(value, "tolist"):  # numpy arrays produced by polars -> pandas
        value = value.tolist()
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    if pd.isna(value):  # scalar NaN / pd.NA
        return []
    return [value]


# --- Step 1: Data Preparation ---
def prepare_data(df, is_train=True, model_dir="model_artifacts"):
    """Normalise the raw project frame into model-ready columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``list_*`` columns, ``title`` and ``objective`` read
        below, plus ``status`` when ``is_train`` is true.
    is_train : bool
        When true, keep only CLOSED / TERMINATED rows and add the binary
        ``label`` column (CLOSED -> 0, TERMINATED -> 1).
    model_dir : str
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with cleaned multi-label columns and the derived
        ``euroSciVoc_intermediate``, ``topic_title``, ``n_partners``,
        ``n_country`` and ``n_sme`` columns.
    """
    df = df.copy()

    if is_train:
        df['status'] = df['status'].astype(str).str.upper()
        # .copy() so the assignments below never write into a view.
        df = df[df['status'].isin(['CLOSED', 'TERMINATED'])].copy()
        df['label'] = df['status'].map({'CLOSED': 0, 'TERMINATED': 1})
        assert df['label'].notna().all(), "Label column still has NaNs!"

    multilabel_fields = [
        'list_country', 'list_activityType', 'list_deliverableType',
        'list_availableLanguages', 'list_euroSciVocTitle'
    ]

    def extract_intermediate_levels(paths):
        # Every path component except the leaf, e.g. "/a/b/c" -> {"a", "b"}.
        # _as_list also accepts numpy arrays; the previous isinstance(list)
        # check returned [] for them.
        tokens = set()
        for path in _as_list(paths):
            if isinstance(path, str):
                tokens.update(path.strip('/').split('/')[:-1])
        return sorted(tokens)

    df['euroSciVoc_intermediate'] = df['list_euroSciVocPath'].apply(extract_intermediate_levels)
    multilabel_fields.append('euroSciVoc_intermediate')

    for col in multilabel_fields:
        df[col] = df[col].apply(
            lambda cell: [str(item).upper() for item in _as_list(cell) if item is not None]
        )

    def split_languages(lang_list):
        # Entries can be comma-joined ("EN,FR"); split them into single codes
        # and strip whitespace so "EN, FR" does not yield a distinct " FR".
        codes = []
        for entry in _as_list(lang_list):
            if isinstance(entry, str):
                codes.extend(part.strip() for part in entry.split(",") if part.strip())
        return codes

    df["list_availableLanguages"] = df["list_availableLanguages"].apply(split_languages)

    def first_topic(cell):
        items = _as_list(cell)
        return items[0] if items else "unknown_topic"

    df['topic_title'] = df['list_title_topic'].apply(first_topic)

    for col in ['title', 'objective', 'topic_title']:
        df[col] = df[col].fillna("").astype(str)

    df['n_partners'] = df['list_name'].apply(lambda cell: len(_as_list(cell)))
    df['n_country'] = df['list_country'].apply(lambda cell: len(_as_list(cell)))
    df['n_sme'] = df['list_SME'].apply(
        lambda cell: sum(
            1 for flag in _as_list(cell)
            if isinstance(flag, (bool, np.bool_)) and bool(flag)
        )
    )

    return df


# --- Step 2: Text Embedding ---
def compute_embeddings(df, text_columns, model_name='paraphrase-multilingual-MiniLM-L12-v2', svd_dim=50):
    """Add SVD-reduced sentence embeddings for each text column.

    Raw embeddings are cached in ``embeddings/<col>_embeddings.npy`` and the
    fitted SVD is saved to ``model_artifacts/<col>_svd.pkl``.

    A cached file is reused only when its row count matches ``df``; otherwise
    it is recomputed. NOTE: a cache with the same row count but different
    rows is still accepted -- delete the ``embeddings`` folder when the input
    data changes.

    Returns ``df`` with ``<col>_embed_<i>`` columns appended.
    """
    os.makedirs("model_artifacts", exist_ok=True)
    os.makedirs("embeddings", exist_ok=True)
    model = None  # loaded lazily: not needed when every column is cached
    embed_frames = []
    for col in text_columns:
        embedding_file = f"embeddings/{col}_embeddings.npy"
        svd_file = f"model_artifacts/{col}_svd.pkl"

        embeddings = None
        if os.path.exists(embedding_file):
            cached = np.load(embedding_file)
            if cached.shape[0] == len(df):
                print(f"Loading saved embeddings for column '{col}'...")
                embeddings = cached
            else:
                print(f"Cached embeddings for column '{col}' have {cached.shape[0]} rows, "
                      f"expected {len(df)}; recomputing...")
        if embeddings is None:
            print(f"Computing embeddings for column '{col}'...")
            if model is None:
                model = SentenceTransformer(model_name)
            embeddings = model.encode(df[col].tolist(), show_progress_bar=True)
            np.save(embedding_file, embeddings)

        print(f"Fitting SVD for column '{col}'...")
        svd = TruncatedSVD(n_components=svd_dim, random_state=42)
        reduced = svd.fit_transform(embeddings)
        joblib.dump(svd, svd_file)

        embed_frames.append(pd.DataFrame(
            reduced,
            columns=[f'{col}_embed_{i}' for i in range(reduced.shape[1])],
            index=df.index,  # align with df so concat does not introduce NaN rows
        ))
    return pd.concat([df, *embed_frames], axis=1)


# --- Step 3: Build Preprocessor ---
def build_preprocessor(numeric_features, categorical_features, multilabel_fields):
    """Build the ``ColumnTransformer`` for numeric, categorical and multi-label columns.

    Numeric columns are median-imputed, extended with an anomaly score and
    scaled; categorical columns are imputed with ``'missing'`` and one-hot
    encoded; each multi-label column gets its own
    ``MultiLabelBinarizerTransformer``. The output is always dense.
    """
    numeric_pipeline = SKPipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('anomaly', AnomalyScoreTransformer()),
        ('scaler', StandardScaler())
    ])

    categorical_pipeline = SKPipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    transformers = [
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
        # Scalar column selector: each binarizer receives a Series.
        *[(f'mlb_{col}', MultiLabelBinarizerTransformer(), col) for col in multilabel_fields],
    ]

    return ColumnTransformer(transformers, sparse_threshold=0.0)


# --- Step 4: Build Pipeline ---
def build_pipeline(preprocessor, base_model, k=250, random_state=42):
    """Assemble preprocessing, resampling, feature selection and a calibrated classifier.

    ``random_state`` seeds SMOTEENN so repeated runs resample identically.
    """
    return ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('anomaly', AnomalyScoreTransformer()),
        ('resample', SMOTEENN(random_state=random_state)),
        ("variance_filter", VarianceThreshold(threshold=0.0)),
        ('feature_select', SelectKBest(score_func=f_classif, k=k)),
        ('classifier', CalibratedClassifierCV(estimator=base_model, method='isotonic', cv=3))
    ])


def _transform_for_classifier(pipeline, X):
    """Run ``X`` through every fitted transform step preceding the classifier.

    Samplers (steps exposing ``fit_resample``) only act during ``fit`` and are
    skipped, mirroring what the pipeline itself does at predict time.
    """
    Xt = X
    for _, step in pipeline.steps[:-1]:
        if step is None or step == 'passthrough' or hasattr(step, 'fit_resample'):
            continue
        Xt = step.transform(Xt)
    if scipy.sparse.issparse(Xt):
        Xt = Xt.toarray()
    return Xt


def _classifier_feature_names(pipeline):
    """Names of the columns that reach the classifier of a *fitted* pipeline."""
    names = np.asarray(pipeline.named_steps['preprocessor'].get_feature_names_out(), dtype=object)
    for _, step in pipeline.steps[1:-1]:
        if step is None or step == 'passthrough' or hasattr(step, 'fit_resample'):
            continue
        if hasattr(step, 'get_support'):
            names = names[step.get_support()]
        elif hasattr(step, 'get_feature_names_out'):
            names = np.asarray(step.get_feature_names_out(names), dtype=object)
    return names


# --- Step 5: Drift Monitoring ---
def monitor_drift(reference, current, feature_names, output_html='drift_report.html'):
    """Write an Evidently data-drift report comparing ``current`` to ``reference``.

    ``reference`` / ``current`` are 2-D arrays (or DataFrames) whose columns
    correspond positionally to ``feature_names``.
    """
    feature_names = [str(name) for name in feature_names]
    # np.asarray first: pd.DataFrame(<DataFrame>, columns=...) would *reindex*
    # by label and silently produce all-NaN columns instead of renaming.
    ref_df = pd.DataFrame(np.asarray(reference), columns=feature_names)
    cur_df = pd.DataFrame(np.asarray(current), columns=feature_names)

    report = Report(metrics=[DataDriftPreset()])
    result = report.run(reference_data=ref_df, current_data=cur_df)
    # NOTE(review): depending on the Evidently version, save_html lives on the
    # object returned by run() or on the Report itself -- support both.
    target = result if hasattr(result, 'save_html') else report
    target.save_html(output_html)
    print(f"✅ Drift report saved to {output_html}")


# --- Step 6: Evaluation + SHAP ---
def evaluate_model(model, X_train, X_test, y_train, y_test, feature_names=None):
    """Fit ``model``, print test metrics and show a SHAP summary plot.

    ``feature_names`` is optional; when omitted (or when its length does not
    match the classifier input) the names are derived from the fitted
    pipeline.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    plt.title("Evaluation")
    plt.tight_layout()
    plt.show()

    # Apply *all* fitted transform steps (preprocessor, anomaly score,
    # variance filter, feature selection) so the column count matches what
    # the classifier was trained on.
    X_selected = _transform_for_classifier(model, X_test)
    if feature_names is None or len(feature_names) != X_selected.shape[1]:
        feature_names = _classifier_feature_names(model)

    # The estimator passed to CalibratedClassifierCV is never fitted itself;
    # the fitted copies live in calibrated_classifiers_ (one per CV fold).
    # Only the first fold's model is explained here.
    fold = model.named_steps['classifier'].calibrated_classifiers_[0]
    fitted_estimator = getattr(fold, 'estimator', None)
    if fitted_estimator is None:  # older scikit-learn naming
        fitted_estimator = fold.base_estimator

    explainer = shap.Explainer(fitted_estimator, feature_names=list(feature_names))
    shap_values = explainer(X_selected)
    shap.summary_plot(shap_values, X_selected)


# --- Final Orchestration ---
def status_prediction_model(df):
    """Train, tune, evaluate and persist the CLOSED-vs-TERMINATED classifier.

    Side effects: writes embeddings, the fitted pipeline, preprocessor,
    training split, feature config and a drift report to disk.
    """
    os.makedirs("model_artifacts", exist_ok=True)
    print("🧹 Preparing data...")
    df = prepare_data(df, is_train=True)
    print("💡 Embedding text...")
    df = compute_embeddings(df, ['title', 'objective', 'topic_title'])

    text_embed_cols = [col for col in df.columns if '_embed_' in col]
    numeric_features = ['durationDays', 'startYear', 'ecMaxContribution', 'totalCost',
                        'n_partners', 'n_country', 'n_sme'] + text_embed_cols
    categorical_features = ['frameworkProgramme', 'fundingScheme', 'legalBasis', 'nature']
    multilabel_fields = ['list_country', 'list_activityType', 'list_deliverableType',
                         'list_availableLanguages', 'list_euroSciVocTitle', 'euroSciVoc_intermediate']

    df = df[numeric_features + categorical_features + multilabel_fields + ['label']]
    X = df.drop(columns='label')
    y = df['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    print("🧱 Building pipeline...")
    preprocessor = build_preprocessor(numeric_features, categorical_features, multilabel_fields)
    base_model = XGBClassifier(eval_metric='logloss', n_jobs=-1, random_state=42)

    print("🎯 Training model with Optuna...")
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 2.0, 10.0)
        }
        base_model.set_params(**params)
        pipeline = build_pipeline(preprocessor, base_model)
        # Tune on the training split only; using the full X, y would leak the
        # held-out test rows into hyper-parameter selection.
        scores = cross_val_score(pipeline, X_train, y_train,
                                 cv=StratifiedKFold(3, shuffle=True, random_state=42),
                                 scoring=make_scorer(f1_score, pos_label=1))
        return scores.mean()

    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=10)
    best_params = study.best_trial.params
    base_model.set_params(**best_params)

    print("✅ Training final model and evaluating...")
    final_pipeline = build_pipeline(preprocessor, base_model)
    # Feature names can only be read from a fitted pipeline, so they are
    # resolved inside evaluate_model after fitting.
    evaluate_model(final_pipeline, X_train, X_test, y_train, y_test)

    print("📊 Monitoring drift...")
    fitted_preprocessor = final_pipeline.named_steps['preprocessor']
    ref_data = fitted_preprocessor.transform(X_train)
    cur_data = fitted_preprocessor.transform(X_test)
    if scipy.sparse.issparse(ref_data):
        ref_data = ref_data.toarray()
    if scipy.sparse.issparse(cur_data):
        cur_data = cur_data.toarray()
    # Drift is measured on the preprocessor output, so use its own names.
    monitor_drift(ref_data, cur_data, fitted_preprocessor.get_feature_names_out())

    print("💾 Saving model and artifacts...")
    joblib.dump(final_pipeline, "model_artifacts/model.pkl")
    joblib.dump(fitted_preprocessor, "model_artifacts/preprocessor.pkl")
    X_train.to_csv("model_artifacts/X_train_processed.csv", index=False)
    y_train.to_csv("model_artifacts/y_train.csv", index=False)
    feature_config = {
        "numeric_features": numeric_features,
        "categorical_features": categorical_features,
        "multilabel_fields": multilabel_fields
    }
    with open("model_artifacts/feature_config.json", "w") as fh:
        json.dump(feature_config, fh)
    print("✅ Training complete. Model artifacts saved.")


def score(new_data_df, model_dir="model_artifacts"):
    """Score new projects with the persisted pipeline.

    Returns ``(pred, prob)``: predicted labels and class probabilities.

    SECURITY: ``joblib.load`` unpickles arbitrary code -- only point
    ``model_dir`` at artifacts you produced yourself.
    """
    # Load saved artifacts
    model = joblib.load(os.path.join(model_dir, "model.pkl"))
    with open(os.path.join(model_dir, "feature_config.json")) as fh:
        config = json.load(fh)
    text_cols = ['title', 'objective', 'topic_title']
    numeric_features = config["numeric_features"]
    categorical_features = config["categorical_features"]
    multilabel_fields = config["multilabel_fields"]
    new_data_df = prepare_data(new_data_df, is_train=False)

    # Text embedding using saved SVDs
    sbert = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    embed_frames = []
    for col in text_cols:
        svd = joblib.load(os.path.join(model_dir, f"{col}_svd.pkl"))
        emb = sbert.encode(new_data_df[col].tolist(), show_progress_bar=False)
        reduced = svd.transform(emb)
        embed_frames.append(pd.DataFrame(
            reduced,
            columns=[f'{col}_embed_{i}' for i in range(reduced.shape[1])],
            index=new_data_df.index,  # keep rows aligned with the input frame
        ))
    # Previously concatenated onto an undefined `df`, so the embedding columns
    # never reached new_data_df (and the call raised UnboundLocalError).
    new_data_df = pd.concat([new_data_df, *embed_frames], axis=1)

    # Final input set
    final_X = new_data_df[numeric_features + categorical_features + multilabel_fields]
    pred = model.predict(final_X)
    prob = model.predict_proba(final_X)

    return pred, prob
def plot_project_duration_distribution(df):
    """Histogram of ``durationDays``; prints a notice if the column is missing."""
    if 'durationDays' not in df.columns:
        print("Column 'durationDays' not found in DataFrame.")
        return
    durations = pd.to_numeric(df['durationDays'], errors='coerce').dropna()
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(durations, bins=50, ax=ax)
    ax.set_title('Distribution of Project Durations (days)')
    ax.set_xlabel('Duration (days)')
    ax.set_ylabel('Number of Projects')
    plt.show()


def plot_ec_contribution_by_year(df):
    """Bar chart of summed ``ecMaxContribution`` per ``startYear``.

    The caller's DataFrame is left untouched: numeric coercion happens on a
    local frame instead of overwriting ``df``'s columns.
    """
    if 'startYear' not in df.columns or 'ecMaxContribution' not in df.columns:
        print("Required columns not found in DataFrame.")
        return
    funding = pd.DataFrame({
        'startYear': pd.to_numeric(df['startYear'], errors='coerce'),
        'ecMaxContribution': pd.to_numeric(df['ecMaxContribution'], errors='coerce'),
    }).dropna(subset=['startYear'])
    yearly_funding = funding.groupby('startYear')['ecMaxContribution'].sum().dropna()
    # Coercion with NaNs yields float years; show "2014", not "2014.0".
    yearly_funding.index = yearly_funding.index.astype(int)

    fig, ax = plt.subplots(figsize=(10, 6))
    yearly_funding.plot(kind='bar', ax=ax)
    ax.set_title('Total EC Max Contribution by Start Year')
    ax.set_ylabel('Total Contribution (€)')
    ax.set_xlabel('Start Year')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()


def plot_participation_by_country(df):
    """Bar chart of the 15 countries appearing most often in ``list_country``."""
    if 'list_country' not in df.columns:
        print("Column 'list_country' not found in DataFrame.")
        return
    # explode() yields one row per list entry, so this counts participations.
    top_countries = df['list_country'].explode().dropna().value_counts().head(15)
    fig, ax = plt.subplots(figsize=(10, 6))
    top_countries.plot(kind='bar', ax=ax)
    ax.set_title('Top 15 Countries by Project Participation')
    ax.set_ylabel('Number of Participations')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()


def success_prediction_model(df):
    """Fit and report a baseline logistic-regression CLOSED-vs-other classifier.

    Prints the missing column names and returns ``None`` when a required
    column is absent, and likewise when fewer than two classes are present.
    Otherwise prints the classification report of a 70/30 split.

    NOTE: every status other than ``CLOSED`` is labelled 0.
    """
    features = ['durationDays', 'ecMaxContribution', 'netEcContribution', 'startYear', 'endYear', 'title', 'objective']
    numeric_features = ['durationDays', 'ecMaxContribution', 'netEcContribution', 'startYear', 'endYear']
    text_features_title = 'title'
    text_features_objective = 'objective'

    # Validate *before* indexing; the old check ran after df[features] had
    # already raised KeyError, so it could never fire.
    missing = [col for col in ['status'] + features if col not in df.columns]
    if missing:
        for col in missing:
            print(f"Missing expected column: {col}")
        return

    data = df[features].copy()
    # Define binary target variable
    data['target'] = (df['status'].astype(str).str.upper() == 'CLOSED').astype(int)

    # Numeric columns may arrive as strings; the median imputer needs numbers.
    data[numeric_features] = data[numeric_features].apply(pd.to_numeric, errors='coerce')
    # Fill missing text with empty string
    data['title'] = data['title'].fillna('').astype(str)
    data['objective'] = data['objective'].fillna('').astype(str)

    X = data[features]
    y = data['target']
    class_counts = y.value_counts()
    if len(class_counts) < 2:
        print("Need both classes present to train a classifier.")
        return

    preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('title_tfidf', TfidfVectorizer(max_features=100), text_features_title),
        ('objective_tfidf', TfidfVectorizer(max_features=100), text_features_objective),
    ])

    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    # Stratify when possible so both splits contain the rare class.
    stratify = y if class_counts.min() >= 2 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=stratify
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("✅ Classification Report:")
    print(classification_report(y_test, y_pred))


def cluster_projects(df, n_clusters=5):
    """K-means cluster projects on duration and EC contributions and plot them.

    Prints a notice and returns ``None`` when a required column is missing or
    fewer usable rows than ``n_clusters`` remain after cleaning.
    """
    features = ['durationDays', 'ecMaxContribution', 'netEcContribution']
    missing = [col for col in features if col not in df.columns]
    if missing:
        print(f"Required columns not found in DataFrame: {missing}")
        return

    cleaned = df[features].apply(pd.to_numeric, errors='coerce').dropna()
    if len(cleaned) < n_clusters:
        print(f"Only {len(cleaned)} usable rows; need at least {n_clusters} to form clusters.")
        return

    scaled_features = StandardScaler().fit_transform(cleaned)

    # n_init pinned explicitly: its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(scaled_features)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(scaled_features[:, 0], scaled_features[:, 1], c=clusters, cmap='viridis')
    ax.set_title('Project Clusters')
    ax.set_xlabel('Scaled Duration')
    ax.set_ylabel('Scaled EC Contribution')
    plt.show()
import matplotlib.pyplot as plt

# Number of projects per status, most frequent first.
# Uses distinct names so the country-level variables of the EDA cell
# (`countries`, `country_names`, `project_counts`) are not overwritten.
status_counts = (
    consolidated
    .filter(pl.col("status").is_not_null())
    .group_by("status")
    # GroupBy.count() is deprecated; pl.len() is its replacement.
    .agg(pl.len().alias("project_count"))
    .sort("project_count", descending=True)
)

status_names = status_counts["status"].to_list()[:30]
status_project_counts = status_counts["project_count"].to_list()[:30]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(status_names, status_project_counts)
# The chart shows statuses, not countries: label it accordingly.
ax.set_title("Projects by Status")
ax.set_xlabel("Status")
ax.set_ylabel("Number of Projects")
ax.tick_params(axis="x", rotation=90)
fig.tight_layout()
plt.show()
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Romain\\AppData\\Local\\Temp\\ipykernel_2396\\478720037.py:82: DeprecationWarning: `GroupBy.count` is deprecated. It has been renamed to `len`.\n", " .count()\n" ] }, { "data": { "image/png": "", "text/plain": [ "
import os
import pathlib
from itertools import combinations

import folium
import matplotlib.pyplot as plt
import networkx as nx
import polars as pl
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

# Paths
# The data root can be overridden with the CORDIS_ROOT environment variable;
# the default keeps the original local location.
ROOT = pathlib.Path(os.environ.get(
    "CORDIS_ROOT",
    r"C:\Users\Romain\OneDrive - KU Leuven\Masters\MBIS\Year 2\Semester 2\Modern Data Analytics\CORDIS",
))
OUTDIR = ROOT / "combined"
PARQUET = OUTDIR / "consolidated.parquet"
PROJECT_PUB = OUTDIR / "projectPublications_all.parquet"

# Load consolidated dataset
df = pl.read_parquet(PARQUET)

# Parse ISO dates that were stored as strings.
for col in ["startDate", "endDate"]:
    if df[col].dtype == pl.Utf8:
        # `with_column` no longer exists in polars; `with_columns` is the API.
        df = df.with_columns(
            pl.col(col).str.strptime(pl.Date, "%Y-%m-%d").alias(col)
        )

# Total net EC contribution per project = sum over its participants.
df = df.with_columns(
    pl.col("list_netEcContribution")
    .list.eval(pl.element().cast(pl.Float64), parallel=True)
    .list.sum()
    .alias("netEcContribution")
)

df = df.with_columns(
    pl.col("totalCost").cast(pl.Float64),
    pl.col("netEcContribution").cast(pl.Float64),
)

df = df.with_columns([
    pl.col("startDate").dt.year().alias("startYear"),
    pl.col("endDate").dt.year().alias("endYear"),
    (pl.col("endDate") - pl.col("startDate")).dt.total_days().alias("durationDays"),
    # Null instead of inf/NaN when the total cost is zero.
    pl.when(pl.col("totalCost") > 0)
    .then(pl.col("netEcContribution") / pl.col("totalCost"))
    .otherwise(None)
    .alias("ecRatio"),
])


def plot_year_histogram(year_values, title):
    """Plot a one-bin-per-year histogram of ``year_values`` (a list of ints)."""
    if not year_values:
        print(f"No data for '{title}'.")
        return
    plt.figure()
    # Bin edges must run to max + 1 (range stop max + 2); otherwise the final
    # bin is [max - 1, max] and merges the last two years.
    plt.hist(year_values, bins=range(min(year_values), max(year_values) + 2))
    plt.title(title)
    plt.xlabel("Year")
    plt.ylabel("Count")
    plt.show()


# 1. Histogram of project start years
years = [y for y in df["startYear"].to_list() if y is not None]
plot_year_histogram(years, "Projects by Start Year")

# 2. Histogram of project end years
end_years = [y for y in df["endYear"].to_list() if y is not None]
plot_year_histogram(end_years, "Projects by End Year")

# 3. Duration vs. Total Cost scatter plot
valid = df.filter(
    pl.col("durationDays").is_not_null() &
    pl.col("totalCost").is_not_null()
)
durations = valid["durationDays"].to_list()
costs = valid["totalCost"].to_list()
plt.figure()
plt.scatter(durations, costs)
plt.title("Project Duration vs. Total Cost")
plt.xlabel("Duration (days)")
plt.ylabel("Total Cost")
plt.show()


# 4. Country breakdown (one count per entry of list_country)
# NOTE(review): if a project lists the same country several times it is
# counted several times -- confirm whether that is intended for "projects".
countries = (
    df
    .explode("list_country")
    .filter(pl.col("list_country").is_not_null())
    .group_by("list_country")
    # GroupBy.count() is deprecated; pl.len() is its replacement.
    .agg(pl.len().alias("project_count"))
    .sort("project_count", descending=True)
)

country_names = countries["list_country"].to_list()[:30]
project_counts = countries["project_count"].to_list()[:30]

plt.figure(figsize=(10, 6))
plt.bar(country_names, project_counts)
plt.title("Projects by Country (Top 30)")
plt.xlabel("Country")
plt.ylabel("Number of Projects")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
Geo scatter (parse \"lat, lon\" from list_geolocation)\n", "locs = (\n", " df\n", " .explode(\"list_geolocation\")\n", " .filter(pl.col(\"list_geolocation\").is_not_null())\n", ")\n", "\n", "locs = locs.with_columns(\n", " pl.col(\"list_geolocation\")\n", " .str.split(\",\")\n", " .alias(\"geo_split\")\n", ")\n", "\n", "locs = locs.with_columns([\n", " pl.col(\"geo_split\")\n", " .list.get(0) \n", " .str.strip_chars().str.replace(r\"[^\\d\\.\\-]\", \"\", literal=False)\n", " .cast(pl.Float64) \n", " .alias(\"lat\"),\n", " pl.col(\"geo_split\")\n", " .list.get(1)\n", " .str.strip_chars().str.replace(r\"[^\\d\\.\\-]\", \"\", literal=False)\n", " .cast(pl.Float64)\n", " .alias(\"lon\"),\n", "])\n", "\n", "locs = locs.filter(pl.col(\"lat\").is_not_null() & pl.col(\"lon\").is_not_null())\n", "\n", "lats = locs[\"lat\"].to_list()\n", "lons = locs[\"lon\"].to_list()\n", "\n", "center_lat = sum(lats) / len(lats)\n", "center_lon = sum(lons) / len(lons)\n", "\n", "m = folium.Map(location=[center_lat, center_lon], zoom_start=4, tiles=\"CartoDB positron\")\n", "\n", "for lat, lon in zip(lats, lons):\n", " folium.CircleMarker(\n", " location=[lat, lon],\n", " radius=3,\n", " fill=True,\n", " fill_opacity=0.6\n", " ).add_to(m)\n", "\n", "m" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Romain\\AppData\\Local\\Temp\\ipykernel_14980\\3443882100.py:59: DeprecationWarning: `GroupBy.count` is deprecated. It has been renamed to `len`.\n", " .count()\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAxYAAAGGCAYAAADmRxfNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAACJ0klEQVR4nOzdeVxN+f8H8NctrVotJbRqUYQwjDXZZRnb2GUfxi7L5GuEGNuMdTQYS2Vfss9QqCbrEFFZkqWEUVGSilL3/v7wcH9z5xaR65yr1/PxuI9H93NO9eoo977PZ5PIZDIZiIiIiIiISkFD6ABERERERKT+WFgQEREREVGpsbAgIiIiIqJSY2FBRERERESlxsKCiIiIiIhKjYUFERERERGVGgsLIiIiIiIqNRYWRERERERUaiwsiIiIiIio1FhYEBGVcX/99RckEgn++usvedvQoUNhY2MjWKaSkkgkGD9+/HvPCwwMhEQiQVJSkupDERGVUSwsiIjUwNs3xm8furq6qFq1Kjp06IDVq1fjxYsXQkcsM2xsbBT+LYp7BAYGCh2ViOizKid0ACIiKjk/Pz/Y2tri9evXSElJwV9//YXJkydj+fLlOHz4MOrUqfNJvs+GDRsglUo/ydf60qxcuRLZ2dny50ePHsXOnTuxYsUKVKpUSd7etGlTIeIREQmGhQURkRrp1KkTGjZsKH8+c+ZMhIeHo0uXLujWrRtu3rwJPT29Un8fLS2tUn+NDyWVSpGfnw9dXd3P/r0/RPfu3RWep6SkYOfOnejevbtaDB8jIlIVDoUiIlJzrVu3xuzZs3H//n1s27ZN4Vh8fDx69+6NChUqQFdXFw0bNsThw4ff+zX/Pcfi9evXqFChAoYNG6Z0XlZWFnR1dTFt2jR5W15eHubMmQN7e3vo6OjA0tISM2bMQF5ensLnvp0fsX37dtSqVQs6OjoICQkBAPzyyy9o2rQpKlasCD09PTRo0ADBwcHF5t2+fTucnJygq6uLBg0a4NSpU+/9GQHg2LFjaNGiBcqXLw9DQ0N07twZ169fL9HnFmfOnDnQ0tLCkydPlI599913MDExwatXrwC8GVbVpUsXHD9+HPXq1YOuri5cXFywf/9+pc/NzMzE5MmTYWlpCR0dHdjb22PJkiVKPUu7du1CgwYNYGhoCCMjI7i6umLVqlWl+pmIiEqChQUR0Rdg8ODBAIDjx4/L265fv46vv/4aN2/ehI+PD5YtW4by5cuje/fuOHDgQIm/tpaWFnr06IGDBw8iPz9f4djBgweRl5eHfv36AXjT69CtWzf88ssv6Nq1K3799Vd0794dK1asQN++fZW+dnh4OKZMmYK+ffti1apV8mJm1apVcHNzg5+fHxYuXIhy5crh22+/xZ9//qn0NSIjIzF58mQMGjQIfn5+SE9PR8eOHXHt2rV3/lxbt25F586dYWBggCVLlmD27Nm4ceMGmjdvXqpJ3oMHD0ZBQQF2796t0J6fn4/g4GD06tVLoVfm9u3b6Nu3Lzp16oRFixbJf9YTJ07Iz8nNzYW7uzu2bdsGLy8vrF69Gs2aNcPMmTPh7e0tP+/EiRPo378/TE1NsWTJEixevBitWrXC2bNnP/rnISIqMRkREYleQECADIAsKiqq2HOMjY1lbm5u8udt2rSRubq6yl69eiVvk0qlsqZNm8
ocHBzkbRERETIAsoiICHnbkCFDZNbW1vLnoaGhMgCyI0eOKHxPT09PmZ2dnfz51q1bZRoaGrLTp08rnLdu3ToZANnZs2flbQBkGhoasuvXryv9LLm5uQrP8/PzZbVr15a1bt1aoR2ADIDs0qVL8rb79+/LdHV1ZT169JC3vb1+iYmJMplMJnvx4oXMxMRENmrUKIWvl5KSIjM2NlZqf5eff/5Z4WvLZDJZkyZNZI0bN1Y4b//+/UrX2draWgZAtm/fPnnb8+fPZRYWFgr/lvPnz5eVL19elpCQoPA1fXx8ZJqamrLk5GSZTCaTTZo0SWZkZCQrKCgocX4iok+FPRZERF8IAwMD+epQGRkZCA8PR58+ffDixQs8ffoUT58+RXp6Ojp06IDbt2/j0aNHJf7arVu3RqVKlRTuwj979gwnTpxQ6InYu3cvnJ2dUbNmTfn3fPr0KVq3bg0AiIiIUPi67u7ucHFxUfp+/54n8uzZMzx//hwtWrRAdHS00rlNmjRBgwYN5M+trKzwzTffIDQ0FIWFhUX+PCdOnEBmZib69++vkFNTUxONGzdWyvmhvLy8cOHCBdy9e1fetn37dlhaWsLd3V3h3KpVq6JHjx7y50ZGRvDy8sKVK1eQkpIC4M11bdGiBUxNTRXytm3bFoWFhfKhXyYmJsjJyVHo7SAi+lw4eZuI6AuRnZ0NMzMzAMCdO3cgk8kwe/ZszJ49u8jz09LSUK1atRJ97XLlyqFXr17YsWMH8vLyoKOjg/379+P169cKhcXt27dx8+ZNVK5cudjv+W+2trZFnvfHH39gwYIFuHr1qsLcDIlEonSug4ODUpujoyNyc3Px5MkTVKlSRen47du3AUBe8PyXkZFRke0l1bdvX0yePBnbt2+Hr68vnj9/jj/++ANTpkxR+hns7e2V2hwdHQEASUlJqFKlCm7fvo3Y2Nj3XtexY8diz5496NSpE6pVq4b27dujT58+6NixY6l+HiKikmBhQUT0BXj48CGeP38Oe3t7AJBP6J02bRo6dOhQ5Oe8Pbek+vXrh/Xr1+PYsWPo3r079uzZg5o1a6Ju3bryc6RSKVxdXbF8+fIiv4alpaXC86JWsDp9+jS6deuGli1b4rfffoOFhQW0tLQQEBCAHTt2fFDm4ry9Plu3bi2y8ChXrnQvj6ampujSpYu8sAgODkZeXh4GDRr00XnbtWuHGTNmFHn8bSFiZmaGq1evIjQ0FMeOHcOxY8cQEBAALy8vBAUFffTPQ0RUEiwsiIi+AFu3bgUAeRFhZ2cH4M3E67Zt236S79GyZUtYWFhg9+7daN68OcLDwzFr1iyFc2rUqIGYmBi0adOmyN6Fkti3bx90dXURGhoKHR0deXtAQECR57/tffi3hIQE6OvrF3uHv0aNGgDevBH/VNfnv7y8vPDNN98gKioK27dvh5ubG2rVqqV03tvepX9fr4SEBACQT2avUaMGsrOzS5RVW1sbXbt2RdeuXSGVSjF27FisX78es2fP/uBikojoQ3COBRGRmgsPD8f8+fNha2uLgQMHAnjzhrlVq1ZYv349Hj9+rPQ5RS2F+j4aGhro3bs3jhw5gq1bt6KgoEBppac+ffrg0aNH2LBhg9Lnv3z5Ejk5Oe/9PpqampBIJArzI5KSknDw4MEizz9//rzC3IsHDx7g0KFDaN++PTQ1NYv8nA4dOsDIyAgLFy7E69evlY5/zPX5r06dOqFSpUpYsmQJIiMji+2t+OeffxRW6crKysKWLVtQr149eW9Knz59cP78eYSGhip9fmZmJgoKCgAA6enpCsc0NDTkmyb+d7lfIqJPjT0WRERq5NixY4iPj0dBQQFSU1MRHh6OEydOwNraGocPH1ZYxtTf3x/NmzeHq6srRo0aBTs7O6SmpuL8+fN4+PAhYmJiPvj79+3bF7/++ivmzJkDV1dXODs7KxwfPHgw9uzZgzFjxiAiIgLNmjVDYWEh4uPjsWfPHoSGhips8FeUzp07Y/
ny5ejYsSMGDBiAtLQ0+Pv7w97eHrGxsUrn165dGx06dMDEiROho6OD3377DQAwb968Yr+HkZER1q5di8GDB6N+/fro168fKleujOTkZPz5559o1qwZ1qxZ88HX59+0tLTQr18/rFmzBpqamujfv3+R5zk6OmLEiBGIioqCubk5Nm/ejNTUVIUemunTp+Pw4cPo0qULhg4digYNGiAnJwdxcXEIDg5GUlISKlWqhJEjRyIjIwOtW7dG9erVcf/+ffz666+oV6+e0r8VEdEnJ/SyVERE9H5vl0t9+9DW1pZVqVJF1q5dO9mqVatkWVlZRX7e3bt3ZV5eXrIqVarItLS0ZNWqVZN16dJFFhwcLD+nJMvNviWVSmWWlpYyALIFCxYU+T3z8/NlS5YskdWqVUumo6MjMzU1lTVo0EA2b9482fPnz+XnAZCNGzeuyK+xadMmmYODg0xHR0dWs2ZNWUBAgGzOnDmy/75svf0a27Ztk5/v5uam8LP8+/r9e0nYtz97hw4dZMbGxjJdXV1ZjRo1ZEOHDlVYvvZ9ilpu9q2LFy/KAMjat29f5OdaW1vLOnfuLAsNDZXVqVNH/vPu3btX6dwXL17IZs6cKbO3t5dpa2vLKlWqJGvatKnsl19+keXn58tkMpksODhY1r59e5mZmZlMW1tbZmVlJRs9erTs8ePHJf55iIg+lkQmk8kEqmmIiIi+aDExMahXrx62bNki38Tw32xsbFC7dm388ccfAqQjIvq0OMeCiIhIRTZs2AADAwP07NlT6ChERCrHORZERESf2JEjR3Djxg38/vvvGD9+PMqXLy90JCIilWNhQURE9IlNmDABqamp8PT0fOckciKiLwnnWBARERERUalxjgUREREREZUaCwsiIiIiIio1QedYnDp1Cj///DMuX76Mx48f48CBA+jevbv8uEwmw5w5c7BhwwZkZmaiWbNmWLt2LRwcHOTnZGRkYMKECThy5Ag0NDTQq1cvrFq1CgYGBvJzYmNjMW7cOERFRaFy5cqYMGECZsyYoZBl7969mD17NpKSkuDg4IAlS5bA09Pzg7K8j1QqxT///ANDQ0NIJJKPuGJERERERJ+PTCbDixcvULVqVWhovKdPQsA9NGRHjx6VzZo1S7Z//34ZANmBAwcUji9evFhmbGwsO3jwoCwmJkbWrVs3ma2trezly5fyczp27CirW7eu7O+//5adPn1aZm9vL+vfv7/8+PPnz2Xm5uaygQMHyq5duybbuXOnTE9PT7Z+/Xr5OWfPnpVpamrKli5dKrtx44bsxx9/lGlpacni4uI+KMv7PHjwQGGDKz744IMPPvjggw8++FCHx4MHD977Xlc0k7clEolCj4VMJkPVqlUxdepUTJs2DQDw/PlzmJubIzAwEP369cPNmzfh4uKCqKgoNGzYEAAQEhICT09PPHz4EFWrVsXatWsxa9YspKSkQFtbGwDg4+ODgwcPIj4+HgDQt29f5OTkKGxQ9PXXX6NevXpYt25dibKUxPPnz2FiYoIHDx7AyMjok1w3IiIiIiJVycrKgqWlJTIzM2FsbPzOc0W73GxiYiJSUlLQtm1beZuxsTEaN26M8+fPo1+/fjh//jxMTEzkRQUAtG3bFhoaGrhw4QJ69OiB8+fPo2XLlvKiAgA6dOiAJUuW4NmzZzA1NcX58+fh7e2t8P07dOiAgwcPljhLSbwd/mRkZMTCgoiIiIjURkmG8Yu2sEhJSQEAmJubK7Sbm5vLj6WkpMDMzEzheLly5VChQgWFc2xtbZW+xttjpqamSElJee/3eV+WouTl5SEvL0/+PCsr6x0/MRERERGR+hJtYfElWLRokag2RrLx+VPoCIJKWtxZ6AhEREREXyzRLjdbpUoVAEBqaqpCe2pqqvxYlSpVkJaWpnC8oKAAGRkZCucU9TX+/T2KO+ffx9+XpSgzZ87E8+fP5Y8HDx6856cmIiIiIlJPoi0sbG1tUaVKFYSFhcnbsrKycOHCBT
Rp0gQA0KRJE2RmZuLy5cvyc8LDwyGVStG4cWP5OadOncLr16/l55w4cQJOTk4wNTWVn/Pv7/P2nLffpyRZiqKjoyOfT8F5FURERET0JRO0sMjOzsbVq1dx9epVAG8mSV+9ehXJycmQSCSYPHkyFixYgMOHDyMuLg5eXl6oWrWqfOUoZ2dndOzYEaNGjcLFixdx9uxZjB8/Hv369UPVqlUBAAMGDIC2tjZGjBiB69evY/fu3Vi1apXCZO1JkyYhJCQEy5YtQ3x8PObOnYtLly5h/PjxAFCiLEREREREZZmgcywuXboEDw8P+fO3b/aHDBmCwMBAzJgxAzk5Ofjuu++QmZmJ5s2bIyQkBLq6uvLP2b59O8aPH482bdrIN8hbvXq1/LixsTGOHz+OcePGoUGDBqhUqRJ8fX3x3Xffyc9p2rQpduzYgR9//BH/+9//4ODggIMHD6J27dryc0qShYiIiIiorBLNPhZlQVZWFoyNjfH8+XNBhkVx8jYnbxMRERF9iA95/yraORZERERERKQ+WFgQEREREVGpsbAgIiIiIqJSY2FBRERERESlxsKCiIiIiIhKjYUFERERERGV2kcVFnZ2dkhPT1dqz8zMhJ2dXalDERERERGRevmowiIpKQmFhYVK7Xl5eXj06FGpQxERERERkXr5oJ23Dx8+LP84NDQUxsbG8ueFhYUICwuDjY3NJwtHRERERETq4YMKi+7duwMAJBIJhgwZonBMS0sLNjY2WLZs2ScLR0RERERE6uGDCgupVAoAsLW1RVRUFCpVqqSSUEREREREpF4+qLB4KzEx8VPnICIiIiIiNfZRhQUAhIWFISwsDGlpafKejLc2b95c6mBERERERKQ+PqqwmDdvHvz8/NCwYUNYWFhAIpF86lxERERERKRGPqqwWLduHQIDAzF48OBPnYeIiIiIiNTQR+1jkZ+fj6ZNm37qLEREREREpKY+qrAYOXIkduzY8amzEBERERGRmvqooVCvXr3C77//jpMnT6JOnTrQ0tJSOL58+fJPEo6IiIiIiNTDRxUWsbGxqFevHgDg2rVrCsc4kZuIiIiIqOz5qMIiIiLiU+cgIiIiIiI19lFzLIiIiIiIiP7to3osPDw83jnkKTw8/KMDERERERGR+vmowuLt/Iq3Xr9+jatXr+LatWsYMmTIp8hFRERERERq5KMKixUrVhTZPnfuXGRnZ5cqEBERERERqZ9POsdi0KBB2Lx586f8kkREREREpAY+aWFx/vx56OrqfsovSUREREREauCjhkL17NlT4blMJsPjx49x6dIlzJ49+5MEIyIiIiIi9fFRhYWxsbHCcw0NDTg5OcHPzw/t27f/JMGIiIiIiEh9fFRhERAQ8KlzEBERERGRGvuowuKty5cv4+bNmwCAWrVqwc3N7ZOEIiIiIiIi9fJRhUVaWhr69euHv/76CyYmJgCAzMxMeHh4YNeuXahcufKnzEhERERERCL3UatCTZgwAS9evMD169eRkZGBjIwMXLt2DVlZWZg4ceKnzkhERERERCL3UT0WISEhOHnyJJydneVtLi4u8Pf35+RtIiIiIqIy6KN6LKRSKbS0tJTatbS0IJVKSx2KiIiIiIjUy0cVFq1bt8akSZPwzz//yNsePXqEKVOmoE2bNp8sHBERERERqYePKizWrFmDrKws2NjYoEaNGqhRowZsbW2RlZWFX3/99VNnJCIiIiIikfuoORaWlpaIjo7GyZMnER8fDwBwdnZG27ZtP2k4IiIiIiJSDx/UYxEeHg4XFxdkZWVBIpGgXbt2mDBhAiZMmICvvvoKtWrVwunTp1WVlYiIiIiIROqDCouVK1di1KhRMDIyUjpmbGyM0aNHY/ny5Z8sHBERERERqYcPKixiYmLQsWPHYo+3b98ely9fLnUoIiIiIiJSLx9UWKSmpha5zOxb5cqVw5MnT0odioiIiIiI1MsHFRbVqlXDtWvXij0eGxsLCwuLUociIiIiIiL18k
GFhaenJ2bPno1Xr14pHXv58iXmzJmDLl26fLJwRERERESkHj5oudkff/wR+/fvh6OjI8aPHw8nJycAQHx8PPz9/VFYWIhZs2apJCgREREREYnXBxUW5ubmOHfuHL7//nvMnDkTMpkMACCRSNChQwf4+/vD3NxcJUGJiIiIiEi8PniDPGtraxw9ehTPnj3DnTt3IJPJ4ODgAFNTU1XkIyIiIiIiNfBRO28DgKmpKb766qtPmYWIiIiIiNTUB03eJiIiIiIiKgoLCyIiIiIiKrWPHgpFVNbY+PwpdATBJS3uXKrPL+vXsLTXj4iISMzYY0FERERERKXGwoKIiIiIiEqNhQUREREREZUa51h8IH9/f/z8889ISUlB3bp18euvv6JRo0ZCxyKiMoBzVDhHhYhIzNhj8QF2794Nb29vzJkzB9HR0ahbty46dOiAtLQ0oaMREREREQmKhcUHWL58OUaNGoVhw4bBxcUF69atg76+PjZv3ix0NCIiIiIiQXEoVAnl5+fj8uXLmDlzprxNQ0MDbdu2xfnz5wVMRkREJcGhZFwuujQ4FI/o/VhYlNDTp09RWFgIc3NzhXZzc3PEx8cX+Tl5eXnIy8uTP3/+/DkAICsrS3VB30GalyvI9xWL0l73sn79AF7D0uL1Kx1ev9Lh9SsdoV67iYT29ndfJpO991wWFiq0aNEizJs3T6nd0tJSgDRkvFLoBOqP17B0eP1Kh9evdHj9SofXj8q6Fy9ewNjY+J3nsLAooUqVKkFTUxOpqakK7ampqahSpUqRnzNz5kx4e3vLn0ulUmRkZKBixYqQSCQqzSs2WVlZsLS0xIMHD2BkZCR0HLXD61d6vIalw+tXOrx+pcPrVzq8fqVXlq+hTCbDixcvULVq1feey8KihLS1tdGgQQOEhYWhe/fuAN4UCmFhYRg/fnyRn6OjowMdHR2FNhMTExUnFTcjI6My9wf5KfH6lR6vYenw+pUOr1/p8PqVDq9f6ZXVa/i+noq3WFh8AG9vbwwZMgQNGzZEo0aNsHLlSuTk5GDYsGFCRyMiIiIiEhQLiw/Qt29fPHnyBL6+vkhJSUG9evUQEhKiNKGbiIiIiKisYWHxgcaPH1/s0Ccqno6ODubMmaM0NIxKhtev9HgNS4fXr3R4/UqH1690eP1Kj9ewZCSykqwdRURERERE9A7ceZuIiIiIiEqNhQUREREREZUaCwsiIiIiIio1FhZERERERFRqLCxIJUJCQnDmzBn5c39/f9SrVw8DBgzAs2fPBEym3g4dOoQtW7YIHUP0CgoKsGXLFqSmpgodRW09ePAADx8+lD+/ePEiJk+ejN9//13AVOqjdevWyMzMVGrPyspC69atP38gNZSZmYnjx49j27Zt2LJli8KD3q1Xr15YsmSJUvvSpUvx7bffCpBI/URHRyMuLk7+/NChQ+jevTv+97//IT8/X8Bk4sZVoUglXF1dsWTJEnh6eiIuLg5fffUVvL29ERERgZo1ayIgIEDoiGqpZs2auH37NgoLC4WOInr6+vq4efMmrK2thY6illq0aIHvvvsOgwcPRkpKCpycnFCrVi3cvn0bEyZMgK+vr9ARRU1DQwMpKSkwMzNTaE9LS0O1atXw+vVrgZKphyNHjmDgwIHIzs6GkZERJBKJ/JhEIkFGRoaA6cSvcuXKCA8Ph6urq0J7XFwc2rZty5suJfDVV1/Bx8cHvXr1wr1791CrVi306NEDUVFR6Ny5M1auXCl0RFHiPhakEomJiXBxcQEA7Nu3D126dMHChQsRHR0NT09PgdOpr/j4eKEjqI1GjRrh6tWrLCw+0rVr19CoUSMAwJ49e1C7dm2cPXsWx48fx5gxY1hYFCM2Nlb+8Y0bN5CSkiJ/XlhYiJCQEFSrVk2IaGpl6tSpGD58OBYuXAh9fX2h46id7OxsaGtrK7VraWkhKytLgETqJyEhAfXq1QMA7N27Fy1btsSOHTtw9uxZ9OvXj4VFMVhYkEpoa2sjNz
cXAHDy5El4eXkBACpUqMD/1OizGDt2LLy9vfHgwQM0aNAA5cuXVzhep04dgZKph9evX8s3gjp58iS6desG4E2v2ePHj4WMJmr16tWDRCKBRCIpcsiTnp4efv31VwGSqZdHjx5h4sSJLCo+kqurK3bv3q10A2DXrl3ym370bjKZDFKpFMCb/wO7dOkCALC0tMTTp0+FjCZqLCxIJZo3bw5vb280a9YMFy9exO7duwG8uQNQvXp1gdOJX0BAAAwMDJTGwu7duxe5ubkYMmSIQMnUR79+/QAAEydOlLdJJBLIZDJIJBIOJ3uPWrVqYd26dejcuTNOnDiB+fPnAwD++ecfVKxYUeB04pWYmAiZTAY7OztcvHgRlStXlh/T1taGmZkZNDU1BUyoHjp06IBLly7Bzs5O6Chqafbs2ejZsyfu3r0rL3DDwsKwc+dO7N27V+B06qFhw4ZYsGAB2rZti8jISKxduxbAm79xc3NzgdOJFwsLUok1a9Zg7NixCA4Oxtq1a+Vd/8eOHUPHjh0FTid+ixYtwvr165XazczM8N1337GwKIHExEShI6i1JUuWoEePHvj5558xZMgQ1K1bFwBw+PBh+RApUvZ26N3bO530cTp37ozp06fjxo0bcHV1hZaWlsLxtz1oVLSuXbvi4MGDWLhwIYKDg6Gnp4c6derg5MmTcHd3FzqeWlixYgUGDRqEgwcPYtasWbC3twcABAcHo2nTpgKnEy9O3iYSIV1dXcTHx8PGxkahPSkpCc7Oznj58qUwwahMKSwsRFZWFkxNTeVtSUlJ0NfXV5qUTMru3r2LlStX4ubNmwAAFxcXTJo0CTVq1BA4mfhpaBS/aCV7HElIr169Qrly5VCuHO/NF4XLzZJKaGpqIi0tTak9PT2dwwBKwMzMTGES6FsxMTEchvIBtm7dimbNmqFq1aq4f/8+AGDlypU4dOiQwMnUg0wmw+XLl7F+/Xq8ePECwJvhPBz3/n6hoaFwcXHBxYsXUadOHdSpUwcXLlxArVq1cOLECaHjiZ5UKi32waLi/ezs7JCenq7UnpmZyeFlJVTcNXz16hUcHR0FSKQeWFiQShTXEZaXl1fkShWkqH///pg4cSIiIiJQWFiIwsJChIeHY9KkSfK5A/Rua9euhbe3Nzw9PZGZmSl/M2JiYsLVPErg/v37cHV1xTfffINx48bhyZMnAN4MkZo2bZrA6cTPx8cHU6ZMwYULF7B8+XIsX74cFy5cwOTJk/HDDz8IHY++cElJSUUWYHl5eXj06JEAidTPu67hv/f4IUXsx6FPavXq1QDedFVv3LgRBgYG8mOFhYU4deoUatasKVQ8tTF//nwkJSWhTZs28u5WqVQKLy8v/PTTTwKnUw+//vorNmzYgO7du2Px4sXy9oYNG/KNcQlMmjQJDRs2VOol69GjB0aNGiVgMvVw8+ZN7NmzR6l9+PDhLGxLKCcnB5GRkUhOTlbakOzfizLQ/zt8+LD849DQUBgbG8ufFxYWIiwsTGmILSkqyTW0tbUVIppaYGFBn9SKFSsAvOmxWLduncKwJ21tbdjY2GDdunVCxVMb2tra2L17NxYsWICrV69CT08Prq6u3JPhAyQmJsLNzU2pXUdHBzk5OQIkUi+nT5/GuXPnlHoYbWxseMezBCpXroyrV6/CwcFBof3q1aucn1ICV65cgaenJ3Jzc5GTk4MKFSrg6dOn8vk9LCyK1r17dwBvbu79d5EPLS0t2NjYYNmyZQIkUx+8hqXDwoI+qbcr8Xh4eODAgQMwMTERNpCa8vPzw7Rp0+Dg4KDwxuTly5f4+eefuTlZCdja2ha5QV5ISAicnZ0FSqU+ihvL/vDhQxgaGgqQSL2MGjUK3333He7duydfQebs2bNYsmQJvL29BU4nflOmTEHXrl2xbt06GBsb4++//4aWlhYGDRqESZMmCR1PtN6uRmZra4uoqChUqlRJ4ETqh9ewdLgqFH1yr1+/Rs2aNfHHH3/wDdxH0t
TUxOPHj5XubKanp8PMzIyTF0tg48aNmDt3LpYtW4YRI0Zg48aNuHv3LhYtWoSNGzdyrsp79O3bF8bGxvj9999haGiI2NhYVK5cGd988w2srKwQEBAgdERRk8lkWLlyJZYtW4Z//vkHAFC1alVMnz4dEydOhEQiETihuJmYmODChQtwcnKCiYkJzp8/D2dnZ1y4cAFDhgxBfHy80BHVxqtXr6Crqyt0DCoj2GNBn5yWlhZevXoldAy19nYTt/+KiYlBhQoVBEikfkaOHAk9PT38+OOPyM3NxYABA1C1alWsWrWKRUUJLFu2DB06dICLiwtevXqFAQMG4Pbt26hUqRJ27twpdDzRk0gkmDJlCqZMmSJfUYs9PSWnpaUlX3LWzMwMycnJcHZ2hrGxMR48eCBwOvGTSqX46aefsG7dOqSmpiIhIQF2dnaYPXs2bGxsMGLECKEjqoWwsDCEhYUhLS1NaW+azZs3C5RK3FhYkEqMGzcOS5YswcaNG7nW8wcwNTWFRCKBRCKBo6OjQnFRWFiI7OxsjBkzRsCE6mXgwIEYOHAgcnNzkZ2dzbHtH6B69eqIiYnBrl27EBsbi+zsbIwYMQIDBw6Enp6e0PHUCguKD+fm5oaoqCg4ODjA3d0dvr6+ePr0KbZu3YratWsLHU/0FixYgKCgICxdulRhsYXatWtj5cqVLCxKYN68efDz80PDhg1hYWHBXsYS4lAoUokePXogLCwMBgYGcHV1Rfny5RWO79+/X6Bk4hYUFASZTCZfOebfq1G8nfzepEkTARMSUXHc3NxK/OYjOjpaxWnU26VLl/DixQt4eHggLS0NXl5eOHfuHBwcHLB582b5TvBUNHt7e6xfvx5t2rSBoaEhYmJiYGdnh/j4eDRp0gTPnj0TOqLoWVhYYOnSpRg8eLDQUdQKbyWTSpiYmKBXr15Cx1A7b1egsLW1RbNmzdjbUwqpqamYNm2avBv7v/dQOE/l/W7duoVff/1VvnO0s7Mzxo8fzyWji/F2NRkqvYYNG8o/NjMzQ0hIiIBp1M+jR49gb2+v1C6VSvH69WsBEqmf/Px8+cILVHJ810IqwYmdpePu7o67d+8iICAAd+/exapVq2BmZoZjx47BysoKtWrVEjqi6A0dOhTJycmYPXs2u7E/wr59+9CvXz80bNhQ3kv2999/w9XVFbt27eKNgyLMmTNH6AhEAAAXFxecPn1aaVW84ODgIpfhJmUjR47Ejh07MHv2bKGjqBUWFqRST548wa1btwAATk5OqFy5ssCJ1ENkZCQ6deqEZs2a4dSpU/jpp59gZmaGmJgYbNq0CcHBwUJHFL0zZ87g9OnTqFevntBR1NKMGTMwc+ZM+Pn5KbTPmTMHM2bMYGFBnxyHkn06vr6+GDJkCB49egSpVIr9+/fj1q1b2LJlC/744w+h46mFV69e4ffff8fJkydRp04daGlpKRxfvny5QMnEjYUFqUROTg4mTJiALVu2yFdS0NTUhJeXF3799Vfo6+sLnFDcfHx8sGDBAnh7eytM/GzdujXWrFkjYDL1YWlpqTT8iUru8ePH8PLyUmofNGgQfv75ZwESiV+FChWQkJCASpUqyRdiKE5GRsZnTKYeOJTs0/nmm29w5MgR+Pn5oXz58vD19UX9+vVx5MgRtGvXTuh4aiE2NlZ+Y+ratWsKx9gDXjwWFqQS3t7eiIyMxJEjR9CsWTMAb+4gT5w4EVOnTsXatWsFTihucXFx2LFjh1K7mZkZnj59KkAi9bNy5Ur4+Phg/fr1sLGxETqO2mnVqhVOnz6tNE77zJkzaNGihUCpxG3FihXyGwErVqzgm48PxKFkn1aLFi1w4sQJoWOorYiICKEjqCWuCkUqUalSJQQHB6NVq1YK7REREejTpw+ePHkiTDA1Ub16dezZswdNmzZVWNHjwIEDmDZtGu7evSt0RFH6713inJwcFBQUQF9fX6kbm3eM323dunXw9fVFnz598PXXXwN4M8di7969mD
dvHqpWrSo/t1u3bkLFFK28vDwUFBQorYhH77d7924cPnwY+fn5aNOmDZfYJkHduXMHd+/eRcuWLaGnp1fsPlP0BgsLUgl9fX1cvnxZaeft69evo1GjRsjJyREomXqYNm0aLly4gL1798LR0RHR0dFITU2Fl5cXvLy8eGevGEFBQSU+9+0KXFS0t5uTvY9EIuEKW//y5MkTeHl54eTJk5BKpfjqq6+wbdu2IlfoIWVr167FuHHj4ODgAD09PcTFxcHb25vD7+izS09PR58+fRAREQGJRILbt2/Dzs4Ow4cPh6mpKZYtWyZ0RFFiYUEq0aZNG1SsWBFbtmyBrq4uAODly5cYMmQIMjIycPLkSYETilt+fj7GjRuHwMBAFBYWoly5cigsLMSAAQMQGBgITU1NoSMSURGGDx+OY8eOYeLEidDV1cX69ethYWHBYRUlVKtWLfTp00d+82Tbtm0YPXo0b0bRZ+fl5YW0tDRs3LgRzs7O8pEDoaGh8Pb2xvXr14WOKEosLEglrl27hg4dOiAvL0++kVFMTAx0dXURGhrK5VJL6MGDB4iLi0N2djbc3Nzg4OAgdCS1oampicePHyvttp2eng4zMzPeZX+Pe/fuwc7OTugYasfS0hIbN25Ehw4dAAC3b9+Gs7MzcnJyoKOjI3A68dPT08PNmzfl86KkUin09PSQlJQECwsLYcNRmVKlShWEhoaibt26CkOS7927hzp16iA7O1voiKLEydukErVr18bt27exfft2xMfHAwD69++PgQMHQk9PT+B06sPS0hKWlpZCx1BLxd0zycvLg7a29mdOo37s7e3h7u6OESNGoHfv3vKeR3q3f/75R2FXaAcHB+jo6ODx48dcRKAE8vLyFOalaGhoQFtbGy9fvhQwFZVFOTk5Ra5gmZGRwZsE78DCglRGX18fo0aNEjoGlTGrV68G8Gbs/8aNG2FgYCA/VlhYiFOnTnHn6BKIjo5GQEAAvL29MX78ePTt2xcjRoxAo0aNhI4mev8dqqipqcmljz/A7NmzFd7Q5efn46effoKxsbG8jXsIfDw/Pz94eHhwdbf3aNGiBbZs2YL58+cDePOaIpVKsXTpUnh4eAicTrw4FIpU5tatW/j1119x8+ZNAICzszPGjx/PN3WkUra2tgCA+/fvo3r16gpv8rS1tWFjYwM/Pz80btxYqIhqpaCgAIcPH0ZgYCBCQkLg6OiI4cOHY/DgwdzwsggaGhowNjZWWDUmMzMTRkZGChPiuSpZ0Vq1avXeFXckEgnCw8M/U6Ivj62tLVJTU9GmTRscOXJE6Diide3aNbRp0wb169dHeHg4unXrhuvXryMjIwNnz55FjRo1hI4oSiwsSCX27duHfv36oWHDhmjSpAmAN0tVRkVFYdeuXdy1l1TOw8MD+/fvR0FBASQSCSpVqiR0JLWWl5eH3377DTNnzkR+fj60tbXRp08fLFmyhGPf/6WkK5NxVTIS0suXLxEREQFPT0+ho4ja8+fPsWbNGsTExCA7Oxv169fHuHHj+H/eO7CwIJWoUaMGBg4cCD8/P4X2OXPmYNu2bdyHgVQqMzMT//vf/7Bnzx48e/YMwJs9Lvr164cFCxbAxMRE2IBq5NKlS9i8eTN27dqF8uXLY8iQIRgxYgQePnyIefPmISsrCxcvXhQ6JhERiQALC1IJfX19xMbGKq3dfvv2bdStWxe5ubkCJVMPISEhMDAwQPPmzQEA/v7+2LBhA1xcXODv7w9TU1OBE4pXRkYGmjRpgkePHmHgwIHyvVRu3LiBHTt2wNLSEufOneM1fI/ly5cjICAAt27dgqenJ0aOHAlPT0+F4TwPHz6EjY0NCgoKBExKRMXJzc1FcnIy8vPzFdrr1KkjUCL18urVK8TGxiItLQ1SqVThGDcGLRonb5NKtGrVCqdPn1YqLM6cOcMJYyUwffp0LFmyBAAQFxeHqVOnwtvbGxEREfD29kZAQIDACcXLz88P2trauHv3LszNzZ
WOtW/fHn5+flixYoVACdXD2rVrMXz4cAwdOrTYbn8zMzNs2rTpMycjovd58uQJhg0bhmPHjhV5nMttv19ISAi8vLzw9OlTpWPcGLR47LEglVi3bh18fX3Rp08ffP311wDezLHYu3cv5s2bh6pVq8rPZdWvzMDAANeuXYONjQ3mzp2La9euITg4GNHR0fD09ERKSorQEUXLxsYG69evl+8j8F8hISEYM2YMkpKSPm8wNZOUlAQrKyulHbhlMhkePHgAKysrgZIR0fsMHDgQ9+/fx8qVK9GqVSscOHAAqampWLBgAZYtW4bOnTsLHVH0HBwc0L59e/j6+irdpKLisbAglfjvm5HisOovWoUKFXDmzBm4uLigefPm8PLywnfffYekpCS4uLhwKNk76Ojo4O7du6hevXqRxx8+fAh7e3u8evXqMydTL9xgkISUnJwMS0tLpRWiWNiWjIWFBQ4dOoRGjRrByMgIly5dgqOjIw4fPoylS5fizJkzQkcUPSMjI1y5coWrP32gkr37I/pAUqm0RA++OSla8+bN4e3tjfnz5+PixYvyu0sJCQnFvmGmNypVqvTO3ojExERUqFDh8wVSU8Xdc8rOzuZmeaRytra2ePLkiVJ7RkaGfElpKl5OTo78poCpqan8Wrq6uiI6OlrIaGqjd+/e+Ouvv4SOoXY4x4JU7tWrV3wj8oHWrFmDsWPHIjg4GGvXrkW1atUAAMeOHUPHjh0FTiduHTp0wKxZs3DixAmlHbbz8vIwe/ZsXsN38Pb2BvCmN9HX11dho7LCwkJcuHAB9erVEyidenn48CEOHz5c5ORZbvD2bjKZrMj9LFjYloyTkxNu3boFGxsb1K1bF+vXr4eNjQ3WrVvHpVJLaM2aNfj2229x+vRpuLq6QktLS+H4xIkTBUombhwKRSpRWFiIhQsXYt26dUhNTUVCQgLs7Owwe/Zs2NjYYMSIEUJHpC/Uw4cP0bBhQ+jo6GDcuHGoWbMmZDIZbt68id9++w15eXm4dOkSLC0thY4qSm93lI2MjESTJk0UirO3GwxOmzYNDg4OQkVUC2FhYejWrRvs7OwQHx+P2rVrIykpCTKZTL7hFil7W9iuWrUKo0aNKrKw1dTUxNmzZ4WKqBa2bduGgoICDB06FJcvX0bHjh2RkZEBbW1tBAYGom/fvkJHFL1NmzZhzJgx0NXVRcWKFRUKXYlEgnv37gmYTrxYWJBK+Pn5ISgoCH5+fhg1ahSuXbsGOzs77N69GytXrsT58+eFjih6hYWFOHjwoHzn8lq1aqFbt24KO0lT0RITEzF27FgcP35cPqRHIpGgXbt2WLNmjdJqZaRs2LBhWLVqFYyMjISOopYaNWqETp06Yd68eTA0NERMTAzMzMwwcOBAdOzYEd9//73QEUWJha1q5ObmIj4+HlZWVtwstISqVKmCiRMnwsfHp8TzRomFBamIvb091q9fjzZt2shfVN/euWvSpIl80zIq2p07d+Dp6YlHjx7ByckJAHDr1i1YWlrizz//5GSyEnr27Blu374N4M3vJOdWfJyHDx8CAOf3fABDQ0NcvXoVNWrUgKmpKc6cOYNatWohJiYG33zzDVclew8WtqXj5+eHadOmKfT4AG923P7555/h6+srUDL1UaFCBURFRfH19gOxBCOVePToUZF3haVSKV6/fi1AIvUyceJE1KhRAw8ePEB0dDSio6ORnJwMW1tbjuv8AKampmjUqBEaNWrEouIDSaVS+Pn5wdjYGNbW1rC2toaJiQnmz5+vtFEUKStfvrx8XoWFhQXu3r0rP1bUuvikKCAggEVFKcybNw/Z2dlK7bm5uZg3b54AidTPkCFDsHv3bqFjqB1O3iaVcHFxwenTp2Ftba3QHhwcDDc3N4FSqY/IyEj8/fffCm+GK1asiMWLF6NZs2YCJqOyYtasWdi0aZPC79yZM2cwd+5cvHr1Cj/99JPACcXt66+/xpkzZ+Ds7AxPT09MnToVcXFx2L9/v3
xvHypeTk4OFi9ejLCwsCJ3Peb49ncrbvJ7TEwMb7KUUGFhIZYuXYrQ0FDUqVNHafI2F2AoGgsLUglfX18MGTIEjx49glQqxf79+3Hr1i1s2bIFf/zxh9DxRE9HRwcvXrxQas/OzlZa6YhIFYKCgrBx40aFDSzr1KmDatWqYezYsSws3mP58uXyO8Zv7x7v3r0bDg4OfENSAiNHjkRkZCQGDx4MCwuLIt8kkzJTU1NIJBJIJBI4OjoqXLfCwkJkZ2djzJgxAiZUH3FxcfIbodeuXVM4xt/H4nGOBanM6dOn4efnh5iYGGRnZ6N+/frw9fVF+/bthY4mel5eXoiOjsamTZvQqFEjAMCFCxcwatQoNGjQAIGBgcIGpC+erq4uYmNj4ejoqNB+69Yt1KtXDy9fvhQoGZUFJiYm+PPPP9lD+4GCgoIgk8kwfPhwrFy5EsbGxvJjbye/N2nSRMCE9KVjYUGfXEFBARYuXIjhw4dzsudHyszMxJAhQ3DkyBF592tBQQG6deuGwMBAhRcLIlVo3LgxGjdujNWrVyu0T5gwAVFRUfj7778FSqYe7OzsEBUVhYoVKyq0Z2Zmon79+hzK8x62trY4evQonJ2dhY6iliIjI9G0aVOl4Tv04e7cuYO7d++iZcuW0NPTK3aYGb3BwoJUwsDAANeuXYONjY3QUdTa7du3ER8fDwBwdnbmMqn02URGRqJz586wsrKS3+E8f/48Hjx4gKNHj6JFixYCJxQ3DQ0NpKSkyHc/fis1NRVWVlbIy8sTKJl62LZtGw4dOoSgoCCllY3ow7x69Uppg0ZOjH+/9PR09OnTBxEREZBIJLh9+zbs7OwwfPhwmJqaYtmyZUJHFCXOsSCVaNOmDSIjI1lYlJK1tTWkUilq1KiBcuX450qfj7u7OxISEuDv7y8vbnv27ImxY8eiatWqAqcTr8OHD8s/Dg0NVehdLCwsRFhYGP9fLIFly5bh7t27MDc3h42NjdKd9+joaIGSqYfc3FzMmDEDe/bsQXp6utLxwsJCAVKplylTpkBLSwvJyckKPWd9+/aFt7c3C4ti8J0KqUSnTp3g4+ODuLg4NGjQAOXLl1c4/u8JoaQsNzcXEyZMQFBQEADIdy6fMGECqlWrBh8fH4ET0pcuOTkZlpaWRU7STk5OhpWVlQCpxK979+4A3kzuHDJkiMIxLS0t2NjY8A1JCby9jvRxpk+fjoiICKxduxaDBw+Gv78/Hj16hPXr12Px4sVCx1MLx48fR2hoqNKQbgcHB9y/f1+gVOLHoVCkEu/apVIikfBuyXtMmjQJZ8+excqVK9GxY0fExsbCzs4Ohw4dwty5c3HlyhWhI9IXTlNTE48fP1YaypOeng4zMzP+Db+Hra0toqKiuMsxCcLKygpbtmxBq1atYGRkhOjoaNjb22Pr1q3YuXMnjh49KnRE0TM0NER0dDQcHBwUNvq9dOkSOnToUGRPEHGDPFIRqVRa7INvSN7v4MGDWLNmDZo3b64wSaxWrVoKG20RqUpxExSzs7Ohq6srQCL1kpiYyKKilDIzM7Fx40bMnDkTGRkZAN4MgXr06JHAycQvIyMDdnZ2AN7Mp3h7/Zo3b45Tp04JGU1ttGjRAlu2bJE/l0gkkEqlWLp0KTw8PARMJm4cCkUkQk+ePFG6Uwy82TSKq1GQKnl7ewN48yI6e/ZshYmzhYWFuHDhAurVqydQOvWSk5ODyMhIJCcnK02enThxokCp1ENsbCzatm0LY2NjJCUlYdSoUahQoQL279+P5ORkhTd8pMzOzg6JiYmwsrJCzZo1sWfPHjRq1AhHjhyBiYmJ0PHUwtKlS9GmTRtcunQJ+fn5mDFjBq5fv46MjAycPXtW6HiixcKCSIQaNmyIP//8ExMmTADw/5vxbNy4kWuQk0q9HWYnk8kQFxensCGjtrY26tati2nTpgkVT21cuXIFnp6eyM3NRU5ODipUqICnT59CX18fZmZmLCzew9vbG0OHDsXSpU
thaGgob/f09MSAAQMETKYehg0bhpiYGLi7u8PHxwddu3bFmjVr8Pr1a27QWEK1a9dGQkIC1qxZA0NDQ2RnZ6Nnz54YN24cLCwshI4nWpxjQSRCZ86cQadOnTBo0CAEBgZi9OjRuHHjBs6dO4fIyEg0aNBA6Ij0hRs2bBhWrVrFZSk/UqtWreDo6Ih169bB2NgYMTEx0NLSwqBBgzBp0iT07NlT6IiiZmxsjOjoaNSoUUNhfPv9+/fh5OSEV69eCR1Rrdy/fx+XL1+Gvb096tSpI3Qc0Xv9+jU6duyIdevWwcHBQeg4aoVzLIhEqHnz5rh69SoKCgrg6uqK48ePw8zMDOfPn2dRQZ9FQECAvKh4+PAhHj58KHAi9XL16lVMnToVGhoa0NTURF5eHiwtLbF06VL873//Ezqe6Ono6CArK0upPSEhAZUrVxYgkXqztrZGz549WVSUkJaWFmJjY4WOoZY4FIpIpGrUqIENGzYIHYPKKKlUigULFmDZsmXIzs4G8GaVlKlTp2LWrFnvXPmN3rwxeXuNzMzM5GvhGxsb48GDBwKnE79u3brBz88Pe/bsAfBmOGhycjJ++OEH9OrVS+B04rR69eoSn8uheO83aNAgbNq0icvzfiAWFkQiUdTdueJweAqp2qxZs+Qvqs2aNQPwZoje3Llz8erVqyL3t6D/5+bmhqioKDg4OMDd3R2+vr54+vQptm7ditq1awsdT/SWLVuG3r17w8zMDC9fvoS7uztSUlLQpEkT/u4VY8WKFQrPnzx5gtzcXPlk7czMTM7x+QAFBQXYvHkzTp48WeR+XJyrUjTOsaDPTkNDA61atcLPP//MYT3/oqGhUeIVn7hkL6la1apVsW7dOqXNLA8dOoSxY8dyyc/3uHTpEl68eAEPDw+kpaXBy8sL586dg4ODAzZv3oy6desKHVEtnDlzBrGxscjOzkb9+vXRtm1boSOphR07duC3337Dpk2b4OTkBAC4desWRo0ahdGjR2PgwIECJxS/dy0pK5FIEB4e/hnTqA8WFvTZBQYGIikpCSEhIfj777+FjiMakZGR8o+TkpLg4+ODoUOHyleBOn/+PIKCgrBo0SKlHX2JPjVdXV3ExsbC0dFRof3WrVuoV68eXr58KVAyInqfGjVqIDg4GG5ubgrtly9fRu/evZGYmChQMvrSsbAgEqE2bdpg5MiR6N+/v0L7jh078Pvvv+Ovv/4SJhiVGY0bN0bjxo2Vxm1PmDABUVFRvClQAk+fPkVSUhIkEglsbGxQsWJFoSOplaioKERERCAtLQ1SqVThGIehvJu+vj4iIyPx1VdfKbRfvHgRrVq1Qm5urkDJ6EvHwoI+i6ysLISHh8PJyQnOzs5CxxE9fX19xMTEKC1zl5CQgHr16vFFgVQuMjISnTt3hpWVlUKv2YMHD3D06FG0aNFC4ITidf36dXz//fdKm2i5u7vjt99+Q82aNQVKpj4WLlyIH3/8EU5OTjA3N1cYJsphKO/XtWtXPHr0CBs3bkT9+vUBvOmt+O6771CtWjUcPnxY4ITi1LNnTwQGBsLIyOi9S0Lv37//M6VSL5y8TSrRp08ftGzZEuPHj8fLly/RsGFDJCUlQSaTYdeuXVzV4z0sLS2xYcMGLF26VKF948aNsLS0FCgVlSXu7u5ISEiAv78/4uPjAbx50R07diyqVq0qcDrxSklJgbu7OypXrozly5ejZs2akMlkuHHjBjZs2ICWLVvi2rVrMDMzEzqqqK1atQqbN2/G0KFDhY6iljZv3owhQ4agYcOG0NLSAvBmMnKHDh2wceNGgdOJl7GxsbyINTY2FjiNemKPBalElSpVEBoairp162LHjh2YM2cOYmJiEBQUhN9//12+uy8V7ejRo+jVqxfs7e3RuHFjAG+6sG/fvo19+/bB09NT4IT0JePmUB/vhx9+wMmTJ3H27Fno6uoqHHv58iWaN2+O9u3bY9GiRQIlVA8WFhY4deoUf/9KKSEhATdv3o
REIkHNmjWV5kwRfWpciJxU4vnz56hQoQIAICQkBL169YK+vj46d+6M27dvC5xO/Dw9PZGQkICuXbsiIyMDGRkZ6Nq1KxISElhUkMpxc6iPd+LECfzwww9KRQUA6OnpYfr06QgNDRUgmXqZMmUK/P39hY6h9hwdHdGtWzd07dqVRQV9FuyxIJVwdHTEggUL0LlzZ9ja2mLXrl1o3bo1YmJi0KZNGzx9+lToiET0DlOmTIGOjg43h/pAJiYmuHTpEuzt7Ys8fufOHTRs2BCZmZmfN5iakUql6Ny5MxISEuDi4iIfzvMWx7e/35YtW/Dzzz/Lb+Y5Ojpi+vTpGDx4sMDJ1IOtre07l4C/d+/eZ0yjPjjHglRi8uTJGDhwIAwMDGBtbY1WrVoBAE6dOgVXV1dhw4lUbGwsateuDQ0NjffeLa5Tp85nSkVlFTeH+jgvXrx45waWhoaG8p3MqXgTJ05EREQEPDw8ULFixRLv8UNvLF++HLNnz8b48eMVNrgcM2YMnj59iilTpgicUPwmT56s8Pz169e4cuUKQkJCMH36dGFCqQH2WJDKXL58GcnJyWjXrh0MDAwAAH/++SdMTU3RtGlTgdOJj4aGBlJSUmBmZibfLK+oP0+JRMIN8kjluDnUx9HU1ERCQgIqV65c5PHU1FTUrFmTf8PvYWhoiF27dqFz585CR1FLtra2mDdvHry8vBTag4KCMHfuXO5jUQr+/v64dOkSAgIChI4iSiwsSCX8/Pwwbdo06OvrK7S/fPkSP//8M3x9fQVKJl7379+HlZUVJBIJ7t+//85zra2tP1MqIvoQb28KFEcmk/HmQAlYW1sjNDSUS/N+JF1dXVy7dk1pSN7t27fh6uqKV69eCZRM/d27dw/16tVDVlaW0FFEiYUFqYSmpiYeP36stKRieno6zMzM+KL6Hq9evSpy8ieREB48eAAAXOq4BCIjI0t0nru7u4qTqLeAgACEhIQgICBA6QYVvV/t2rUxYMAA/O9//1NoX7BgAXbv3o24uDiBkqm/pUuX4rfffkNSUpLQUUSJcyxIJd7elfuvmJgY+WpRVDwzMzP06NEDgwYNQps2baChwQXc6PMqKCjAvHnzsHr1avmcAAMDA0yYMAFz5sxRmkxLb7Bg+DRWr16Nu3fvwtzcHDY2Nkq/b9HR0QIlUw/z5s1D3759cerUKfkci7NnzyIsLAx79uwROJ16cHNzU3gfI5PJkJKSgidPnuC3334TMJm4sbCgT8rU1BQSiQQSiQSOjo4Kf5SFhYXIzs7GmDFjBEyoHoKCgrBjxw588803MDY2Rt++fTFo0CA0bNhQ6GhURkyYMAH79+/H0qVLFXbenjt3LtLT07F27VqBE9KXrHv37kJHUGu9evXCxYsXsXz5chw8eBAA4OzsjIsXL8LNzU3YcGriv7+DGhoaqFy5Mlq1asUheu/AoVD0SQUFBUEmk2H48OFYuXKlws6V2trasLGxkb9Jofd78eIFgoODsXPnToSHh8POzg6DBg3iHBVSOWNjY+zatQudOnVSaD969Cj69++P58+fC5SMiN7l9evXGD16NGbPng1bW1uh41AZw8KCVCIyMhJNmzblcIlP6MaNGxg4cCBiY2M5R4VUzszMDJGRkXB2dlZov3nzJlq2bIknT54IlIzKiszMTAQHB+Pu3buYPn06KlSogOjoaJibm6NatWpCxxM1Y2NjXL16lYVFKXzI5Ox3LTFd1rCwIJWRSqW4c+cO0tLSIJVKFY61bNlSoFTq5dWrVzh8+DB27NiBkJAQmJubo3///ty0jFTOz88P8fHxCAgIgI6ODgAgLy8PI0aMgIODA+bMmSNwQvqSxcbGom3btjA2NkZSUhJu3boFOzs7/Pjjj0hOTsaWLVuEjihqQ4YMQb169bhfRSm8b4U3gKu8FYVzLEgl/v77bwwYMAD3799X2ouBf4TvFxoaih07duDgwYMoV64cevfujePHj7Mgo8/mypUrCAsLQ/Xq1V
G3bl0AbxZfyM/PR5s2bdCzZ0/5udwFmT41b29vDB06FEuXLoWhoaG83dPTEwMGDBAwmXpwcHCAn58fzp49W+QGlxMnThQomfoICAiAj48Phg4dqjDPLCgoCIsWLYKNjY2wAUWKPRakEvXq1YOjoyPmzZsHCwsLpar/33MvSJm+vj66dOmCgQMHwtPTk0PK6LMbNmxYic/lRlHKcnJysHjxYoSFhRXZa3vv3j2BkqkHY2NjREdHo0aNGjA0NERMTAzs7Oxw//59ODk5cR+G93jXECiJRMLfvxJo06YNRo4cif79+yu079ixA7///jv++usvYYKJHHssSCVu376N4OBgpc15qGRSU1MV7tIRfW4sFkpn5MiRiIyMxODBg4u8uULvpqOjU+QY93ftak7/jztrl9758+exbt06pfaGDRti5MiRAiRSDywsSCUaN26MO3fusLD4SCwqiNTbsWPH8Oeff8r3EKAP061bN/j5+cn3XJBIJEhOTsYPP/yAXr16CZyOygJLS0ts2LABS5cuVWjfuHEjNwt9BxYWpBITJkzA1KlTkZKSAldXV6WhPHXq1BEoGRGR6pmamnIz0FJYtmwZevfuDTMzM7x8+RLu7u5ISUlBkyZN8NNPPwkdT/R69eqFRo0a4YcfflBoX7p0KaKiorB3716BkqmPFStWoFevXjh27BgaN24MALh48SJu376Nffv2CZxOvDjHglSiqJ2iJRIJV1AgojJh27ZtOHToEIKCgqCvry90HLV15swZxMbGIjs7G/Xr10fbtm2FjqQWKleujPDwcLi6uiq0x8XFoW3btkhNTRUomXp58OAB1q5di/j4eABvNhkcM2YMeyzegYUFqcT9+/ffedza2vozJSEi+jzc3NwU5lLcuXMHMpkMNjY2Sr220dHRnzselSF6enq4evUqnJycFNrj4+Ph5uaGly9fCpSMvnQcCkUqwcKBiMqa7t27Cx3hi7F69eoi2yUSCXR1dWFvb4+WLVtCU1PzMydTD66urti9ezd8fX0V2nft2gUXFxeBUqmf06dPY/369bh37x727t2LatWqYevWrbC1tUXz5s2FjidKLCxIZbZu3Yp169YhMTER58+fh7W1NVauXAlbW1t88803QsdTW8OHD4eHhwcGDx4sdBQqo7Zs2YJmzZqhRo0aQkcRFW4a+OmsWLECT548QW5uLkxNTQEAz549g76+PgwMDJCWlgY7OztERERwWEoRZs+ejZ49e+Lu3bto3bo1ACAsLAw7d+7k/IoS2rdvHwYPHoyBAwciOjoaeXl5AIDnz59j4cKFOHr0qMAJxUl5IDzRJ7B27Vp4e3vD09MTmZmZ8jkVJiYmWLlypbDh1Ny9e/cwe/Zs1KtXT+goVEYNHToULi4umDBhgtBRRCsqKgoXLlxQar9w4QIuXbokQCL1snDhQnz11Ve4ffs20tPTkZ6ejoSEBDRu3BirVq1CcnIyqlSpwp2li9G1a1ccPHgQd+7cwdixYzF16lQ8fPgQJ0+eZM9aCS1YsADr1q3Dhg0bFIYyNmvWjEMZ34FzLEglXFxcsHDhQnTv3l1hc6Nr166hVatWePr0qdAR1d6NGzfYpU2CSUxMxLFjxzB27Fiho4hSo0aNMGPGDPTu3Vuhff/+/ViyZEmRRQf9vxo1amDfvn1KN1CuXLmCXr164d69ezh37hx69eqFx48fCxOSvmj6+vq4ceMGbGxsFN7H3Lt3Dy4uLtyksRjssSCVSExMhJubm1K7jo4OcnJyBEj05WFRQUKytbVlUfEON27cQP369ZXa3dzccOPGDQESqZfHjx+joKBAqb2goAApKSkAgKpVq+LFixefOxqVEVWqVMGdO3eU2s+cOQM7OzsBEqkHFhakEra2trh69apSe0hICJydnT9/IDUTFBSEP//8U/58xowZMDExQdOmTd+74hbRpxAdHY24uDj580OHDqF79+743//+h/z8fAGTqQcdHZ0il/R8/PgxypXj9Mb38f
DwwOjRo3HlyhV525UrV/D999/L5wzExcXB1tZWqIiiU6FCBflogLf7qBT3oPcbNWoUJk2ahAsXLkAikeCff/7B9u3bMW3aNHz//fdCxxMt/u9GKuHt7Y1x48bh1atXkMlkuHjxInbu3IlFixZh48aNQscTvYULF2Lt2rUAgPPnz8Pf3x8rVqzAH3/8gSlTpmD//v0CJ6Qv3ejRo+Hj4wNXV1fcu3cP/fr1Q48ePbB3717k5uZyrtR7tG/fHjNnzsShQ4dgbGwMAMjMzMT//vc/tGvXTuB04rdp0yYMHjwYDRo0kI9vLygoQJs2bbBp0yYAgIGBAZYtWyZkTFFZsWIFDA0NAYB/n5+Aj48PpFIp2rRpg9zcXLRs2RI6OjqYNm0a55e9A+dYkMps374dc+fOxd27dwG86baeN28eRowYIXAy8dPX10d8fDysrKzwww8/4PHjx9iyZQuuX7+OVq1a4cmTJ0JHpC+csbExoqOjUaNGDSxZsgTh4eEIDQ3F2bNn0a9fPzx48EDoiKL28OFDuLu7Iz09XT4s9OrVqzA3N8eJEye4klEJxcfHIyEhAQDg5OSktC8Dkarl5+fjzp07yM7OhouLCwwMDPDy5Uvo6ekJHU2U2GNBKjNw4EAMHDgQubm5yM7OhpmZmdCR1IaBgQHS09NhZWWF48ePw9vbGwCgq6vLjY3os5DJZJBKpQCAkydPokuXLgAAS0tLLr5QAtWrV0dsbCy2b9+OmJgY6OnpYdiwYejfv7/SZnlUvJo1a6JmzZpCx1BbaWlpSEtLk/8tv1WnTh2BEqkfbW1t+ZzGvLw8LF++HEuXLpXP9SFFLCxI5fT19aGvry90DLXSrl07jBw5Em5ubkhISICnpycA4Pr167CxsRE2HJUJDRs2xIIFC9C2bVtERkbKh+YlJibC3Nxc4HTi9vr1a9SsWRN//PEHvvvuO6HjqK2HDx/i8OHDSE5OVprXs3z5coFSqYfLly9jyJAhuHnzJv47MEUikciXgCdleXl5mDt3Lk6cOAFtbW3MmDED3bt3R0BAAGbNmgVNTU0uc/wOLCxIJdLT0+Hr64uIiIgi75ZkZGQIlEw9+Pv748cff8SDBw+wb98+VKxYEcCbF4v+/fsLnI7KghUrVmDQoEE4ePAgZs2aBXt7ewBAcHAwmjZtKnA6cdPS0uJSlKUUFhaGbt26wc7ODvHx8ahduzaSkpIgk8mKXG2LFA0fPhyOjo7YtGkTzM3NIZFIhI6kNnx9fbF+/Xq0bdsW586dw7fffothw4bh77//xvLly/Htt99yx/d34BwLUglPT0/cuXMHI0aMKPI/tSFDhgiUjIhK49WrVyhXrhxXNnqPhQsXIiEhARs3buS1+giNGjVCp06dMG/ePPkeAmZmZhg4cCA6duzIVXnew9DQEFeuXJHfEKCSs7Ozw8qVK9GtWzdcu3YNderUwdChQ7Fp0yYWaCXAwoJUwtDQEGfOnEHdunWFjqKWYmNji2yXSCTQ1dWFlZUVdHR0PnMqKkvs7OwQFRUl7y17KzMzE/Xr18e9e/cESqYeevTogbCwMBgYGMDV1RXly5dXOM6V3d7N0NAQV69eRY0aNWBqaoozZ86gVq1aiImJwTfffIOkpCShI4pa9+7dMXjwYPTq1UvoKGpHW1sbiYmJqFatGgBAT08PFy9ehKurq8DJ1ANvo5BK1KxZk5OMS6FevXrvvDOipaWFvn37Yv369dDV1f2MyaisSEpKKnIcdl5eHh4+fChAIvViYmLCN3WlUL58efm8CgsLC9y9exe1atUCAC4eUAIbN27EkCFDcO3aNdSuXVtpwYBu3boJlEz8CgsLoa2tLX9erlw5GBgYCJhIvbCwIJX47bff4OPjA19f3yL/UzMyMhIomXo4cOAAfvjhB0yfPh2NGjUCAFy8eBHLli3DnDlzUFBQAB8fH/z444/45ZdfBE5LX5LDhw/LPw4NDZXvwQC8ecENCwvjpmQlEBAQIHQEtfb111
/jzJkzcHZ2hqenJ6ZOnYq4uDjs378fX3/9tdDxRO/8+fM4e/Ysjh07pnSMk7ffTSaTYejQofJRAa9evcKYMWPY61hCHApFKnH79m0MGDAA0dHRCu0ymYz/qZVAo0aNMH/+fHTo0EGhPTQ0FLNnz8bFixdx8OBBTJ06Vb5PCNGnoKGhAeDNm4//vjxoaWnBxsYGy5Ytky8/S6QK9+7dQ3Z2NurUqYOcnBxMnToV586dg4ODA5YvXw5ra2uhI4qajY0NunTpgtmzZ3MVtw80bNiwEp3HmwdFY2FBKtGoUSOUK1cOkyZNKnLytru7u0DJ1IOenh6uXLmitH57fHw83Nzc8PLlSyQlJcHFxQW5ubkCpaQvma2tLaKiolCpUiWho6it4OBg7Nmzp8jlUv9704XoU/r3HBWiz4lDoUglrl27hitXrnCX1I9Us2ZNLF68GL///rt8rOfr16+xePFiebHx6NEj3okilUlMTBQ6glpbvXo1Zs2ahaFDh+LQoUMYNmwY7t69i6ioKIwbN07oeGojPz+/yCXLraysBEqkHnr27ImIiAgWFvTZsbAglWjYsCEePHjAwuIj+fv7o1u3bqhevbp8h9S4uDgUFhbijz/+APBmqMDYsWOFjElfuMjISPzyyy+4efMmAMDFxQXTp09HixYtBE4mfr/99ht+//139O/fH4GBgZgxYwbs7Ozg6+vLfXxKICEhASNGjMC5c+cU2jmctmQcHR0xc+ZMnDlzBq6urkrzHCdOnChQMvrScSgUqcTevXsxd+5cTJ8+vcj/1N6+WabivXjxAtu3b0dCQgIAwMnJCQMGDIChoaHAyags2LZtG4YNG4aePXuiWbNmAICzZ8/iwIEDCAwMxIABAwROKG76+vq4efMmrK2tYWZmhhMnTqBu3bq4ffs2vv76a6SnpwsdUdSaNWuGcuXKwcfHBxYWFkrDabmU+bu9a4EFiUTC5aJJZVhYkEq8nQD6b28ng/JuE5H4OTs747vvvsOUKVMU2pcvX44NGzbIezGoaHZ2dti3bx/c3NzQsGFDjBo1CqNHj8bx48fRr18/9lq8R/ny5XH58mWleWZEJG4cCkUqwfHZpXf79m1EREQUOb7Y19dXoFRUVty7dw9du3ZVau/WrRv+97//CZBIvbRu3RqHDx+Gm5sbhg0bhilTpiA4OBiXLl1Cz549hY4nei4uLtyvgkgNsceCSIQ2bNiA77//HpUqVUKVKlUUhgFIJBKuKEMqZ29vj+nTp2P06NEK7evWrcOyZctw+/ZtgZKpB6lUCqlUinLl3ty/27Vrl3y51NGjRytswEXKwsPD8eOPP2LhwoVFDqflXkikakFBQahUqRI6d+4MAJgxYwZ+//13uLi4YOfOnVzyuBgsLEgltmzZ8s7jXl5enymJerK2tsbYsWPxww8/CB2Fyqi1a9di8uTJGD58OJo2bQrgzRyLwMBArFq1SqngIPqU/r2fyr9xOC19Lk5OTli7di1at26N8+fPo23btlixYgX++OMPlCtXjhvkFYOFBamEqampwvPXr18jNzcX2tra0NfX5/ji9zAyMsLVq1dhZ2cndBQqww4cOIBly5bJ51M4Oztj+vTp+OabbwROph5Onz6N9evX4+7duwgODka1atWwdetW2Nraonnz5kLHE7XIyMh3HudeSKRq+vr6iI+Ph5WVFX744Qc8fvwYW7ZswfXr19GqVSs8efJE6IiixDkWpBLPnj1Tart9+za+//57TJ8+XYBE6uXbb7/F8ePHMWbMGKGjUBnWo0cP9OjRQ+gYamnfvn0YPHgwBg4ciCtXriAvLw8A8Pz5cyxcuBBHjx4VOKG4sXAgoRkYGCA9PR1WVlY4fvw4vL29AQC6urp4+fKlwOnEi4UFfTYODg5YvHgxBg0ahPj4eKHjiJq9vT1mz56Nv//+m2uQE6mhBQsWYN26dfDy8sKuXbvk7c2aNcOCBQsETKZecnNzi9y5nEuWf7zk5GRUq1YNmpqaQk
cRtXbt2mHkyJFwc3NDQkICPD09AQDXr1+HjY2NsOFEjIUFfVblypXDP//8I3QM0fv9999hYGCAyMhIpSEBEomEhQWphKmpqdKY9uJwOOO73bp1Cy1btlRqNzY2RmZm5ucPpGaePHmCYcOG4dixY0Ue5xyLj2djYwMHBwcsWrSIK5S9g7+/P3788Uc8ePAA+/btQ8WKFQEAly9fRv/+/QVOJ14sLEglDh8+rPBcJpPh8ePHWLNmjXyzLSoel+slIaxcuVLoCF+MKlWq4M6dO0p3Ns+cOcO5UyUwefJkZGZm4sKFC2jVqhUOHDiA1NRULFiwAMuWLRM6nlqLiIjAvXv3sHv3bhYW72BiYoI1a9Yotc+bN0+ANOqDk7dJJf67QZ5EIkHlypXRunVrLFu2DBYWFgIlIyJSvUWLFmHbtm3YvHkz2rVrh6NHj+L+/fuYMmUKZs+ejQkTJggdUdQsLCxw6NAhNGrUCEZGRrh06RIcHR1x+PBhLF26FGfOnBE6In3hAgICYGBggG+//Vahfe/evcjNzcWQIUMESiZu7LEglfjvhm70ft7e3pg/fz7Kly8vnyRWnOXLl3+mVFSWZGVlyfcHyMrKeue53Efg3Xx8fCCVStGmTRvk5uaiZcuW0NHRwbRp01hUlEBOTg7MzMwAvBmi9+TJEzg6OsLV1ZX7+JTAy5cvIZPJoK+vDwC4f/8+Dhw4ABcXF7Rv317gdOph0aJFWL9+vVK7mZkZvvvuOxYWxWBhQSQSV65cwevXr+UfF6ekY+CJPpSpqSkeP34MMzMzmJiYFPm7xn0ESkYikWDWrFmYPn067ty5g+zsbLi4uMDAwEDoaGrByckJt27dgo2NDerWrYv169fDxsYG69atY493CXzzzTfo2bMnxowZg8zMTDRu3BhaWlp4+vQpli9fju+//17oiKKXnJwMW1tbpXZra2skJycLkEg9sLAglejVqxcaNWqktMHb0qVLERUVhb179wqUTLwiIiKK/JjocwkPD0eFChUA8HfwU9HW1oaLiwuysrJw8uRJODk5wdnZWehYojdp0iQ8fvwYADBnzhx07NgR27dvh7a2NgIDA4UNpwaio6OxYsUKAEBwcDDMzc1x5coV7Nu3D76+viwsSsDMzAyxsbFK86RiYmLkE7lJGedYkEpUrlwZ4eHhcHV1VWiPi4tD27ZtkZqaKlAyIiLV69OnD1q2bInx48fj5cuXqFevHhITEyGTybBr1y706tVL6IhqJTc3V75ZWaVKlYSOI3r/3tytT58+qFWrFubMmYMHDx7AyckJubm5QkcUvR9++AG7d+9GQECAfIW3yMhIDB8+HL1798Yvv/wicEJxYo8FqUR2dja0tbWV2rW0tN47drus+pDVOfbv36/CJERvPHv2DJs2bZLvvO3i4oJhw4bJezWoeKdOncKsWbMAvNnBXCqVIjMzE0FBQViwYAELiw+kr6+P+vXrCx1Dbdjb2+PgwYPo0aMHQkNDMWXKFABAWloa50eV0Pz585GUlIQ2bdqgXLk3b5elUim8vLywcOFCgdOJF3ssSCUaNWqELl26wNfXV6F97ty5OHLkCC5fvixQMvEaNmxYic8NCAhQYRKiN2+Mu3btCmNjYzRs2BDAm/XbMzMzceTIkSL3aKD/p6enh4SEBFhaWsLLywtVq1bF4sWLkZycDBcXF2RnZwsdUZT8/PxKdN5/X1tIUXBwMAYMGIDCwkK0adMGx48fB/BmQvKpU6eK3R+ElCUkJCAmJgZ6enpwdXWFtbW10JFEjYUFqcSRI0fQs2dPDBgwAK1btwYAhIWFYefOndi7dy+6d+8ubEAieidXV1c0adIEa9eule/QW1hYiLFjx+LcuXOIi4sTOKG4OTo6YsGCBejcuTNsbW2xa9cutG7dGjExMWjTpg2ePn0qdERR0tDQQNWqVWFmZobi3p5IJBKuDFUCKSkpePz4MerWrStfAv7ixYswNjaGk5OTwOnoS8XCglTmzz//xM
KFC3H16lXo6emhTp06mDNnDtzd3YWORkTvoaenh6tXryq9Abl16xbq1auHly9fCpRMPfz222+YNGkSDAwMYGVlhStXrkBDQwO//vor9u/fz8nxxejcuTPCw8PRoUMHDB8+HF26dFHaF4neb/jw4Vi1ahUMDQ0V2nNycjBhwgRs3rxZoGTixmXfS4+FBZFIuLm5lXgpWd6tI1Vr1qwZpk+frtS7ePDgQSxevBh///23MMHUyKVLl/DgwQO0a9dOvszsn3/+CRMTEzRr1kzgdOL1zz//ICgoCIGBgcjKyoKXlxeGDx/Ou+wfQFNTU7509L89ffoUVapUQUFBgUDJxM3DwwMHDhyAiYkJPDw8ij1PIpEgPDz8MyZTHywsSKUuX74sn/hZq1YtuLm5CZxIvObNm1fic+fMmaPCJETA7t27MWPGDEyYMAFff/01AODvv/+Gv78/Fi9erLBkap06dYSKKXr5+flITExEjRo15BNAqeROnTqFgIAA7Nu3D66urjh58iT09PSEjiVaWVlZkMlkMDU1xe3bt1G5cmX5scLCQhw5cgQ+Pj74559/BExJXzIWFqQSaWlp6NevH/766y+YmJgAADIzM+Hh4YFdu3Yp/GdHROLzvuEnEomEm+W9Q25uLiZMmICgoCAAbyaA2tnZYcKECahWrRp8fHwETqgeXr58ib1798Lf3x9xcXFISUnhqkbvoKGh8c6eb4lEgnnz5slXLCP61Hj7hFRiwoQJePHiBa5fvy6/s3njxg0MGTIEEydOxM6dOwVOSETvkpiYKHQEtTZz5kzExMTgr7/+QseOHeXtbdu2xdy5c1lYvMf58+exefNm7NmzB46Ojhg2bBgGDBjAouI9IiIiIJPJ0Lp1a+zbt09haWhtbW1YW1ujatWqAiZUHzk5OVi8eDHCwsKQlpYGqVSqcPzevXsCJRM39liQShgbG+PkyZP46quvFNovXryI9u3bIzMzU5hgIlahQgUkJCSgUqVKMDU1feddp4yMjM+YjIg+lLW1NXbv3o2vv/4ahoaGiImJgZ2dHe7cuYP69etzP59iLF26FIGBgXj69CkGDhyIYcOGcajdR7h//z6srKxKPG+PlPXv3x+RkZEYPHgwLCwslK7lpEmTBEombuyxIJWQSqXQ0tJSatfS0lKq+umNFStWyFfwWLlypbBhiKhUnjx5ojRxFnhzF5Rv9orn4+Mj3y1aIpEgMDCwyPO4Is+7WVtb4/Tp01i/fj3u3buHvXv3olq1ati6dStsbW3RvHlzoSOK3rFjx/Dnn39yoYUPxMKCVKJ169aYNGkSdu7cKe92ffToEaZMmYI2bdoInE6cYmJi0Lt3b+jo6MDW1hZNmzblZE8iNdWwYUP8+eefmDBhAgDIi4mNGzeiSZMmQkYTtZYtW0IikeD69evFnsPC7P327duHwYMHY+DAgYiOjkZeXh4A4Pnz51i4cCGOHj0qcELxMzU1VRhKRiXDoVCkEg8ePEC3bt1w/fp1WFpayttq166Nw4cPo3r16gInFB8tLS08fPgQ5ubmxS4VSETq4cyZM+jUqRMGDRqEwMBAjB49Gjdu3MC5c+cQGRmJBg0aCB2RvmBubm6YMmUKvLy8FIbiXblyBZ06dUJKSorQEUVv27ZtOHToEIKCgqCvry90HLXB26GkEpaWloiOjsbJkycRHx8PAHB2dkbbtm0FTiZeNjY2WL16Ndq3bw+ZTIbz58/D1NS0yHNbtmz5mdMR0Ydo3rw5rl69isWLF8PV1RXHjx9H/fr1cf78ebi6ugodj75wt27dKvJ1wtjYmHMc3+G/+0nduXMH5ubmsLGxURrezf2kisbCglRGIpGgXbt2aNeundBR1MLPP/+MMWPGYNGiRZBIJOjRo0eR53F5TyL1UKNGDWzYsEGpPTg4GL179xYgEZUVVapUwZ07d2BjY6PQfubMGdjZ2QkTSg38d0NQ+nAcCkWfnFQqRWBgIPbv34+kpCRIJBLY2tqid+/eGD
x4MMfHvkd2djaMjIxw69atYodCGRsbf+ZURP/P1tYWrVu3xvz587l0ZREKCgoQHx8PbW1tODo6ytsPHToEX19fxMfHy8e8E6nCokWLsG3bNmzevBnt2rXD0aNHcf/+fUyZMgWzZ8+Wz/0h+tTevQMS0QeSyWTo1q0bRo4ciUePHsHV1RW1atXC/fv3MXTo0GLvwtP/MzAwQEREBGxtbWFsbFzkg0hIQ4YMQWFhIVdLKcK1a9dgb2+PunXrwtnZGT179kRqairc3d0xfPhwdOrUCXfv3hU6Jn3hfHx8MGDAALRp0wbZ2dlo2bIlRo4cidGjR7OoKCE7Ozukp6crtWdmZrLX5x3YY0GfVEBAACZNmoRDhw7Bw8ND4Vh4eDi6d++ONWvWwMvLS6CERESq07lzZ+Tl5WHy5MnYuXMndu7cCScnJ4wYMQLjxo2Dnp6e0BGpDMnPz8edO3eQnZ0NFxcXGBgYCB1JbWhoaCAlJUVp5EBqaiosLS2Rn58vUDJxY2FBn1T79u3RunXrYneVXbhwISIjIxEaGvqZkxFRaRQWFiIuLg7W1tbFLipAgJmZGY4fP4569erh+fPnMDU1RVBQEAYPHix0NLWUm5uL5ORkpTdx3DTvw2RlZSE8PBxOTk5wdnYWOo6oHT58GMCb+RZBQUEKowQKCwsRFhaGEydO4NatW0JFFDUWFvRJValSBSEhIahXr16Rx7nUHZF6mDx5MlxdXTFixAgUFhbC3d0d586dg76+Pv744w+0atVK6Iii9N+7nIaGhoiOjoaDg4PAydTLkydPMGzYMBw7dqzI41zA4t369OmDli1bYvz48Xj58iXq1auHxMREyGQy7Nq1C7169RI6omhpaLyZJSCRSPDft8haWlqwsbHBsmXL0KVLFyHiiR7nWNAnlZGRAXNz82KPm5ub49mzZ58xERF9jODgYNStWxcAcOTIESQmJiI+Ph5TpkzBrFmzBE4nXhKJBC9evEBWVhaeP38OiUSCly9fIisrS+FB7zZ58mRkZmbiwoUL0NPTQ0hICIKCguDg4CC/o0zFO3XqFFq0aAEAOHDgAKRSKTIzM7F69WosWLBA4HTiJpVKIZVKYWVlhbS0NPlzqVSKvLw83Lp1i0XFO7CwoE+qsLDwnbtFa2pqoqCg4DMmUk/Dhw/HixcvlNpzcnIwfPhwARJRWfP06VNUqVIFAHD06FF8++23cHR0xPDhwxEXFydwOvGSyWRwdHSU79qbnZ0NNzc3mJqawtTUFCYmJhxKVgLh4eFYvnw5GjZsCA0NDVhbW2PQoEFYunQpFi1aJHQ80Xv+/Ll81+iQkBD06tUL+vr66Ny5M27fvi1wOvWQmJiISpUqCR1D7XAfC/qkZDIZhg4dCh0dnSKPc4nFkgkKCsLixYthaGio0P7y5Uts2bIFmzdvFigZlRXm5ua4ceMGLCwsEBISgrVr1wJ4M+ZdU1NT4HTiFRERIXSEL0JOTo58OJmpqSmePHkCR0dHuLq6cmOyErC0tMT58+dRoUIFhISEYNeuXQCAZ8+eQVdXV+B04ubp6YmdO3fK51YsXrwYY8aMgYmJCQAgPT0dLVq0wI0bNwRMKV4sLOiTGjJkyHvP4YpQxcvKyoJMJoNMJsOLFy8UXgAKCwtx9OjRYve2IPqUhg0bhj59+sDCwgISiQRt27YFAFy4cAE1a9YUOJ14ubu7Cx3hi+Dk5IRbt27BxsYGdevWxfr162FjY4N169bBwsJC6HiiN3nyZAwcOBAGBgawtraWz4k6deoUd35/j9DQUIWboAsXLkSfPn3khUVBQQEnbr8DCwv6pAICAoSOoNZMTEwgkUggkUgUNtZ6SyKRYN68eQIko7Jm7ty5cHV1RXJyMr799lt5L6Smpmaxq74RfSqTJk3C48ePAQBz5sxBx44dsX37dmhrayMwMFDYcGpg7NixaNy4MZKTk9GuXTv5hGQ7OzvOsXiP/07Y5hpHH4arQhGJSGRkJGQyGVq3bo19+/
bJx8gCgLa2NqytrbnTManc69ev0bFjR6xbt46rGZEo5ObmIj4+HlZWVhz3TipV1MpuMTEx8k3xUlNTUbVqVa5MVgz2WBCJyNthFImJibC0tJTfZSL6nLS0tBAbGyt0DCrD/Pz8MG3aNOjr6wMA9PX1Ub9+fbx8+RJ+fn7w9fUVOCF9qd6OGvhvG5UMeyyIRCozMxMXL16UL3f3b5ynQqo2ZcoU6OjoYPHixUJHoTJIU1MTjx8/VppTlp6eDjMzM94tJpXR0NBAp06d5MM/jxw5gtatW6N8+fIA3ixCExISwt/BYrDHgkiEjhw5goEDByI7OxtGRkYKd0skEgkLC1K5goICbN68GSdPnkSDBg3kL6pvLV++XKBkVBbIZLIi7xLHxMQoDBEl+tT+uwjNoEGDlM7ha3Dx2GNBJEKOjo7w9PTEwoUL5UMBiD4nDw+PYo9JJBKEh4d/xjTqJycnB4sXL0ZYWFiRvY737t0TKJm4mZqaQiKR4Pnz50o3VQoLC5GdnY0xY8bA399fwJREVBwWFkQiVL58ecTFxcknixGReunfvz8iIyMxePBg+ZK9/zZp0iSBkolbUFAQZDIZhg8fjpUrV8r3EgDeLGBhY2ODJk2aCJhQveTm5iI5ORn5+fkK7XXq1BEoEX3pWFgQiVDPnj3Rr18/9OnTR+goRHj48CEAoHr16gInUR8mJib4888/0axZM6GjqKXIyEg0bdoUWlpaQkdRS0+ePMGwYcNw7NixIo9zfgCpCudYEIlQ586dMX36dNy4cQOurq5KL67dunUTKBmVFVKpFAsWLMCyZcuQnZ0N4M2yi1OnTsWsWbO4Ytl7mJqaci5AKfx7o8FXr14p3XE3MjL63JHUyuTJk5GZmYkLFy6gVatWOHDgAFJTU+V/00Sqwh4LIhF615s2iUTCu02kcjNnzsSmTZswb948+V33M2fOYO7cuRg1ahR++ukngROK27Zt23Do0CEEBQVxntRHyM3NxYwZM7Bnzx6kp6crHef/ge9mYWGBQ4cOoVGjRjAyMsKlS5fg6OiIw4cPY+nSpThz5ozQEekLxR4LIhH670RPos8tKCgIGzduVOgdq1OnDqpVq4axY8eysHiPZcuW4e7duzA3N4eNjY1Sr2N0dLRAydTD9OnTERERgbVr12Lw4MHw9/fHo0ePsH79ei6BXAI5OTnypXpNTU3x5MkTODo6wtXVlb97pFIsLIhE7tWrV9DV1RU6BpUxGRkZqFmzplJ7zZo1kZGRIUAi9dK9e3ehI6i1I0eOYMuWLWjVqhWGDRuGFi1awN7eHtbW1ti+fTsGDhwodERRc3Jywq1bt2BjY4O6deti/fr1sLGxwbp162BhYSF0PPqCcSgUkQgVFhZi4cKFWLduHVJTU5GQkAA7OzvMnj0bNjY2GDFihNAR6QvXuHFjNG7cGKtXr1ZonzBhAqKiovD3338LlIzKAgMDA9y4cQNWVlaoXr069u/fj0aNGiExMRGurq7yeT9UtG3btqGgoABDhw7F5cuX0bFjR2RkZEBbWxuBgYHo27ev0BHpC8UeCyIR+umnnxAUFISlS5di1KhR8vbatWtj5cqVLCxI5ZYuXYrOnTvj5MmT8uU9z58/jwcPHuDo0aMCp1MPmZmZCA4Oxt27dzF9+nRUqFAB0dHRMDc3R7Vq1YSOJ2p2dnZITEyElZUVatasiT179qBRo0Y4cuQITExMhI4nev/e1K1Bgwa4f/8+4uPjYWVlhUqVKgmYjL50XNaDSIS2bNmC33//HQMHDoSmpqa8vW7duoiPjxcwGZUV7u7uSEhIQI8ePZCZmYnMzEz07NkTt27dQosWLYSOJ3qxsbFwdHTEkiVL8MsvvyAzMxMAsH//fsycOVPYcGpg2LBhiImJAQD4+PjA398furq6mDJlCqZPny5wOvHz8/NDbm6u/Lm+vj7q16+P8uXLw8/PT8Bk9KXjUCgiEdLT00N8fDysra1haGiImJgY2NnZ4caNG2jUqB
GHAZDKJScnw9LSUmljt7fHrKysBEilPtq2bYv69etj6dKlCn/D586dw4ABA5CUlCR0RLVy//59XL58Gfb29tzcrQQ0NTXx+PFj+QTut9LT02FmZsZVtUhlOBSKSIRcXFxw+vRpWFtbK7QHBwfDzc1NoFRUltja2hb7xsTW1pZvTN4jKioK69evV2qvVq0aUlJSBEik3qytrZX+P6TiyWSyIm8KxMTEcH8VUikWFkQi5OvriyFDhuDRo0eQSqXYv38/bt26hS1btuCPP/4QOh6VAcW9McnOzuYqZSWgo6ODrKwspfaEhARUrlxZgETqQyqVIjAwEPv370dSUhIkEglsbW3Ru3dvDB48uMjfS3rD1NQUEokEEokEjo6OCteqsLAQ2dnZGDNmjIAJ6UvHoVBEInX69Gn4+fkhJiYG2dnZqF+/Pnx9fdG+fXuho9EXzNvbGwCwatUqjBo1SmFzt8LCQly4cAGampo4e/asUBHVwsiRI5Geno49e/agQoUKiI2NhaamJrp3746WLVti5cqVQkcUJZlMhq5du+Lo0aOoW7cuatasCZlMhps3byIuLg7dunXDwYMHhY4pWkFBQZDJZBg+fDhWrlwJY2Nj+TFtbW3Y2NjIF2MgUgUWFkREJOfh4QEAiIyMRJMmTaCtrS0/9vaNybRp0+Dg4CBURLXw/Plz9O7dG5cuXcKLFy9QtWpVpKSkoEmTJjh69CjKly8vdERRCggIwKRJk3Do0CH57+Jb4eHh6N69O9asWQMvLy+BEqqHyMhING3aVGljRiJVY2FBJEJ2dnaIiopCxYoVFdozMzNRv3593Lt3T6BkVFYMGzYMq1atgpGRkdBR1NqZM2cQGxsr73Vs27at0JFErX379mjdujV8fHyKPL5w4UJERkYiNDT0MydTX69evUJ+fr5CG/+uSVVYWBCJkIaGBlJSUpQmzqampsLKygp5eXkCJSMiUp0qVaogJCQE9erVK/L4lStX0KlTJ06Af4/c3FzMmDEDe/bsQXp6utJxLr5AqsLJ20QicvjwYfnHoaGhCuNjCwsLERYWBhsbGwGSUVl06dIl7NmzB8nJyUp3PPfv3y9QKvURFhaGFStW4ObNmwAAZ2dnTJ48mb0W75CRkQFzc/Nij5ubm+PZs2efMZF6mj59OiIiIrB27VoMHjwY/v7+ePToEdavX4/FixcLHY++YOyxIBIRDY03e1ZKJBL8909TS0sLNjY2WLZsGbp06SJEPCpDdu3aBS8vL3To0AHHjx9H+/btkZCQgNTUVPTo0QMBAQFCRxS13377DZMmTULv3r3lk2X//vtvBAcHY8WKFRg3bpzACcVJU1MTKSkpxa6clZqaiqpVq/KO+3tYWVlhy5YtaNWqFYyMjBAdHQ17e3ts3boVO3fuxNGjR4WOSF8oFhZEImRra4uoqChUqlRJ6ChURtWpUwejR4/GuHHj5Bu82draYvTo0bCwsMC8efOEjihq1atXh4+PD8aPH6/Q7u/vj4ULF+LRo0cCJRM3DQ0NdOrUCTo6OkUez8vLQ0hICAuL9zAwMMCNGzdgZWWF6tWrY//+/WjUqBESExPh6urKTVZJZTSEDkBEyhITE1lUkKDu3r2Lzp07A3izGlROTg4kEgmmTJmC33//XeB04peZmYmOHTsqtbdv3x7Pnz8XIJF6GDJkCMzMzGBsbFzkw8zMjCtClYCdnR0SExMBADVr1sSePXsAAEeOHIGJiYmAyehLxzkWRCKVk5ODyMjIIse3T5w4UaBUVFaYmprixYsXAN7sFn3t2jW4uroiMzMTubm5AqcTv27duuHAgQOYPn26QvuhQ4c4lPEdOMTu0xg2bBhiYmLg7u4OHx8fdO3aFWvWrMHr16+xfPlyoePRF4xDoYhE6MqVK/D09ERubi5ycnJQoUIFPH36FPr6+jAzM+Nys6RyAwYMQMOGDeHt7Y358+fj119/xTfffIMTJ06gfv36nLz9HgsWLMAvv/yCZs2aKcyxOH
v2LKZOnaqw3CdvFJCq3b9/H5cvX4a9vT3q1KkjdBz6grGwIBKhVq1awdHREevWrYOxsTFiYmKgpaWFQYMGYdKkSejZs6fQEekLl5GRgVevXqFq1aqQSqVYunQpzp07BwcHB/z4448wNTUVOqKo2dralug8iUTCGwVE9MVgYUEkQiYmJrhw4QKcnJxgYmKC8+fPw9nZGRcuXMCQIUMQHx8vdET6ghUUFGDHjh3o0KHDO5f+JCLxkUqlCAwMxP79+5GUlASJRAJbW1v07t0bgwcPhkQiEToifcE4eZtIhLS0tORLz5qZmSE5ORkAYGxsjAcPHggZjcqAcuXKYcyYMXj16pXQUdTe06dP8fTpU6FjUBkhk8nQrVs3jBw5Eo8ePYKrqytq1aqF+/fvY+jQoejRo4fQEekLx8KCSITc3NwQFRUFAHB3d4evry+2b9+OyZMno3bt2gKno7KgUaNGuHr1qtAx1FJmZibGjRuHSpUqwdzcHObm5qhUqRLGjx+PzMxMoePRFywwMBCnTp1CWFgYrly5gp07d2LXrl2IiYnByZMnER4eji1btggdk75gHApFJEKXLl3Cixcv4OHhgbS0NHh5ecnHt2/evBl169YVOiJ94fbs2YOZM2diypQpaNCgAcqXL69wnBNAi5aRkYEmTZrg0aNHGDhwIJydnQEAN27cwI4dO2BpaYlz585xjgqpRPv27dG6dWv4+PgUeXzhwoWIjIxEaGjoZ05GZQULCyIiUvJ2KN6/vd0RXiKRcIOyYkyePBlhYWE4efKk0vyUlJQUtG/fHm3atMGKFSsESkhfsipVqiAkJAT16tUr8viVK1fQqVMnpKSkfN5gVGawsCASsbS0NNy6dQvAm02OKleuLHAiKivu37//zuPW1tafKYl6sbGxwfr169GhQ4cij4eEhGDMmDFISkr6vMGoTNDW1sb9+/dhYWFR5PF//vkHtra2yMvL+8zJqKzgBnlEIvTixQuMHTsWu3btkt8Z1tTURN++feHv7w9jY2OBE9KX7v79+2jatCnKlVN8mSgoKMC5c+dYWBTj8ePHqFWrVrHHa9euzbvFpDKFhYVKf7P/pqmpiYKCgs+YiMoaFhZEIjRy5EhcuXIFf/zxh3xzrfPnz2PSpEkYPXo0du3aJXBC+tJ5eHjg8ePHMDMzU2h//vw5PDw8OBSqGJUqVUJSUhKqV69e5PHExERUqFDhM6eiskImk2Ho0KHQ0dEp8jh7KkjVOBSKSITKly+P0NBQNG/eXKH99OnT6NixI3JycgRKRmWFhoYGUlNTlYbfJSQkoGHDhsjKyhIombgNHz4cd+/exYkTJ6Ctra1wLC8vDx06dICdnR02b94sUEL6kg0bNqxE5wUEBKg4CZVV7LEgEqGKFSsWOdzJ2NiYq8mQSr3d1V0ikSjd+SwsLERsbCyaNm0qVDzR8/PzQ8OGDeHg4IBx48ahZs2akMlkuHnzJn777Tfk5eVh69atQsekLxQLBhIaCwsiEfrxxx/h7e2NrVu3okqVKgDerCgzffp0zJ49W+B09CV7W9DKZDIYGhpCT09PfkxbWxtff/01Ro0aJVQ80atevTrOnz+PsWPHYubMmXg7KEAikaBdu3ZYs2YNLC0tBU5JRKQaHApFJEJubm64c+cO8vLyYGVlBQBITk6Gjo4OHBwcFM6Njo4WIiJ94ebNm4dp06Yp7V9BJffs2TPcvn0bAGBvb8+5FUT0xWOPBZEIde/eXegIVMbNmDED/77vdP/+fRw4cAAuLi5o3769gMnUh6mpKRo1aiR0DCKiz4Y9FkREpKR9+/bo2bMnxowZg8zMTDg5OUFbWxtPnz7F8uXL8f333wsdkYiIREZ5a1UiIirzoqOj0aJFCwBAcHAwqlSpgvv372PLli1YvXq1wOmIiEiMWFgQEZGS3NxcGBoaAgCOHz+Onj17QkNDA19//fV7d+UmIqKyiYUFEREpsbe3x8GDB/HgwQOEhobK51WkpaXByMhI4H
RERCRGLCyIiEiJr68vpk2bBhsbGzRu3Fi+A/zx48fh5uYmcDoiIhIjTt4mIqIipaSk4PHjx6hbty40NN7ch7p48SKMjIxQs2ZNgdMREZHYsLAgUjN+fn7w8PCQT6wlIiIiEgMWFkRqxtbWFqmpqWjTpg2OHDkidBz6QuXk5GDx4sUICwtDWloapFKpwvF79+4JlIyIiMSKG+QRqZnExES8fPkSERERQkehL9jIkSMRGRmJwYMHw8LCAhKJROhIREQkcuyxICIiJSYmJvjzzz/RrFkzoaMQEZGa4KpQRCIUEhKCM2fOyJ/7+/ujXr16GDBgAJ49eyZgMiorTE1NUaFCBaFjEBGRGmFhQSRC06dPR1ZWFgAgLi4OU6dOhaenJxITE+Ht7S1wOioL5s+fD19fX+Tm5godhYiI1ASHQhGJkIGBAa5duwYbGxvMnTsX165dQ3BwMKKjo+Hp6YmUlBShI9IXzs3NDXfv3oVMJoONjQ20tLQUjkdHRwuUjIiIxIqTt4lESFtbW36n+OTJk/Dy8gIAVKhQQd6TQaRK3bt3FzoCERGpGfZYEIlQt27dkJ+fj2bNmmH+/PlITExEtWrVcPz4cYwfPx4JCQlCRyQiIiJSwDkWRCK0Zs0alCtXDsHBwVi7di2qVasGADh27Bg6duwocDoqSy5fvoxt27Zh27ZtuHLlitBxiIhIxNhjQUREStLS0tCvXz/89ddfMDExAQBkZmbCw8MDu3btQuXKlYUNSEREosMeCyKRunv3Ln788Uf0798faWlpAN70WFy/fl3gZFQWTJgwAS9evMD169eRkZGBjIwMXLt2DVlZWZg4caLQ8YiISITYY0EkQpGRkejUqROaNWuGU6dO4ebNm7Czs8PixYtx6dIlBAcHCx2RvnDGxsY4efIkvvrqK4X2ixcvon379sjMzBQmGBERiRZ7LIhEyMfHBwsWLMCJEyegra0tb2/dujX+/vtvAZNRWSGVSpWWmAUALS0tSKVSARIREZHYsbAgEqG4uDj06NFDqd3MzAxPnz4VIBGVNa1bt8akSZPwzz//yNsePXqEKVOmoE2bNgImIyIisWJhQSRCJiYmePz4sVL7lStX5CtEEanSmjVrkJWVBRsbG9SoUQM1atSAra0tsrKy8Ouvvwodj4iIRIgb5BGJUL9+/fDDDz9g7969kEgkkEqlOHv2LKZNmybfLI9IlSwtLREdHY2TJ08iPj4eAODs7Iy2bdsKnIyIiMSKk7eJRCg/Px/jxo1DYGAgCgsLUa5cORQWFmLAgAEIDAyEpqam0BGJiIiIFLCwIBKxBw8eIC4uDtnZ2XBzc4ODg4PQkagMiYqKQkREBNLS0pQmbC9fvlygVEREJFYcCkUkQn5+fpg2bRosLS1haWkpb3/58iV+/vln+Pr6CpiOyoKFCxfixx9/hJOTE8zNzSGRSOTH/v0xERHRW+yxIBIhTU1NPH78GGZmZgrt6enpMDMzQ2FhoUDJqKwwNzfHkiVLMHToUKGjEBGRmuCqUEQiJJPJirwrHBMTgwoVKgiQiMoaDQ0NNGvWTOgYRESkRjgUikhETE1NIZFIIJFI4OjoqFBcFBYWIjs7G2PGjBEwIZUVU6ZMgb+/P1auXCl0FCIiUhMcCkUkIkFBQZDJZBg+fDhWrlwJY2Nj+TFtbW3Y2NigSZMmAiakskIqlaJz585ISEiAi4uL0i7c+/fvFygZERGJFXssiERkyJAhAABbW1s0bdpU6c0c0ecyceJEREREwMPDAxUrVuSEbSIiei/2WBCJRFZWFoyMjOQfv8vb84hUxdDQELt27ULnzp2FjkJERGqCPRZEImFqaipfCcrExKTIO8RvJ3VzVShStQoVKqBGjRpCxyAiIjXCwoJIJMLDw+UrPkVERAichsq6uXPnYs6cOQgICIC+vr7QcYiISA1wKBQRESlxc3PD3bt3IZPJYGNjozTfJzo6WqBkREQkVuyxIBKhkJAQGBgYoHnz5gAAf39/bN
iwAS4uLvD394epqanACelL1717d6EjEBGRmmGPBZEIubq6YsmSJfD09ERcXBwaNmyIqVOnIiIiAjVr1kRAQIDQEYmIiIgUsMeCSIQSExPh4uICANi3bx+6du2KhQsXIjo6Gp6engKno7IiMzMTwcHBuHv3LqZPn44KFSogOjoa5ubmqFatmtDxiIhIZFhYEImQtrY2cnNzAQAnT56El5cXgDcr9bxvKVqiTyE2NhZt27aFsbExkpKSMGrUKFSoUAH79+9HcnIytmzZInREIiISGQ2hAxCRsubNm8Pb2xvz58/HxYsX5XsJJCQkoHr16gKno7LA29sbQ4cOxe3bt6Grqytv9/T0xKlTpwRMRkREYsXCgkiE1qxZg3LlyiE4OBhr166VDzs5duwYOnbsKHA6KguioqIwevRopfZq1aohJSVFgERERCR2HApFJEJWVlb4448/lNpXrFghQBoqi3R0dIocdpeQkIDKlSsLkIiIiMSOhQWRSBUWFuLAgQO4efMmAMDZ2Rndu3dHuXL8syXV69atG/z8/LBnzx4AgEQiQXJyMn744Qf06tVL4HRERCRGXG6WSISuX7+Orl27IjU1FU5OTgD+/07xkSNHULt2bYET0pfu+fPn6N27Ny5duoQXL16gatWqSElJQZMmTXD06FGUL19e6IhERCQyLCyIRKhJkyaoXLkygoKC5JvhPXv2DEOHDsWTJ09w7tw5gRNSWXHmzBnExsYiOzsb9evXR9u2bYWOREREIsXCgkiE9PT0cOnSJdSqVUuh/dq1a/jqq6/w8uVLgZIRERERFY2rQhGJkKOjI1JTU5Xa09LSYG9vL0AiKkukUik2b96MLl26oHbt2nB1dUW3bt2wZcsW8F4UEREVh4UFkUhkZWXJH4sWLcLEiRMRHByMhw8f4uHDhwgODsbkyZOxZMkSoaPSF0wmk6Fbt24YOXIkHj16BFdXV9SqVQv379/H0KFD0aNHD6EjEhGRSHEoFJFIaGhoQCKRyJ+//dN82/bv54WFhZ8/IJUJAQEBmDRpEg4dOgQPDw+FY+Hh4ejevTvWrFkj3w2eiIjoLRYWRCIRGRlZ4nPd3d1VmITKsvbt26N169bw8fEp8vjChQsRGRmJ0NDQz5yMiIjEjoUFERHJValSBSEhIahXr16Rx69cuYJOnTpx920iIlLCORZEInX69GkMGjQITZs2xaNHjwAAW7duxZkzZwRORl+yjIwMmJubF3vc3Nwcz549+4yJiIhIXbCwIBKhffv2oUOHDtDT00N0dDTy8vIAvNm0bOHChQKnoy9ZYWHhO3d319TUREFBwWdMRERE6oJDoYhEyM3NDVOmTIGXlxcMDQ0RExMDOzs7DkMhldPQ0ECnTp2go6NT5PG8vDyEhIRwAQEiIlJS/G0pIhLMrVu30LJlS6V2Y2NjZGZmfv5AVGYMGTLkvedwRSgiIioKCwsiEapSpQru3LkDGxsbhfYzZ87Azs5OmFBUJgQEBAgdgYiI1BTnWBCJ0KhRozBp0iRcuHABEokE//zzD7Zv345p06bh+++/FzoeERERkRL2WBCJkI+PD6RSKdq0aYPc3Fy0bNkSOjo6mDZtGiZMmCB0PCIiIiIlnLxNJCKJiYmwtbWVP8/Pz8edO3eQnZ0NFxcXGBgYCJiOiIiIqHgsLIhERENDA9bW1vDw8EDr1q3h4eGBatWqCR2LiIiI6L1YWBCJyF9//SV/XLhwAfn5+bCzs5MXGR4eHu/cvIyIiIhIKCwsiETq1atXOHfunLzQuHjxIl6/fo2aNWvi+vXrQscjIiIiUsDCgkjk8vPzcfbsWRw7dgzr169HdnY2NycjIiIi0WFhQSQy+fn5+PvvvxERESEfEmVpaYmWLVuiZcuWcHd3h5WVldAxiYiIiBSwsCASkdatW+PChQuwtbWFu7s7WrRoAXd3d1hYWAgdjYiIiOidWFgQiYiWlhYsLCzQvXt3tGrVCu7u7qhYsaLQsYiIiIjei4UFkY
jk5OTg9OnT+OuvvxAREYGrV6/C0dER7u7u8kKjcuXKQsckIiIiUsLCgkjEXrx4gTNnzsjnW8TExMDBwQHXrl0TOhoRERGRAg2hAxBR8cqXL48KFSqgQoUKMDU1Rbly5XDz5k2hYxEREREpYY8FkYhIpVJcunRJPhTq7NmzyMnJQbVq1eQb5Hl4eMDa2lroqEREREQKWFgQiYiRkRFycnJQpUoVeRHRqlUr1KhRQ+hoRERERO/EwoJIRNavXw8PDw84OjoKHYWIiIjog7CwICIiIiKiUuPkbSIiIiIiKjUWFkREREREVGosLIiIiIiIqNRYWBARERERUamxsCAiIiIiolJjYUFERKIgkUje+Zg7d67QEYmI6B3KCR2AiIgIAB4/fiz/ePfu3fD19cWtW7fkbQYGBkLEIiKiEmKPBRERiUKVKlXkD2NjY0gkElSpUgWGhoZwdHRESEiIwvkHDx5E+fLl8eLFCyQlJUEikWDXrl1o2rQpdHV1Ubt2bURGRip8zrVr19CpUycYGBjA3NwcgwcPxtOnTz/nj0lE9MViYUFERKJWvnx59OvXDwEBAQrtAQEB6N27NwwNDeVt06dPx9SpU3HlyhU0adIEXbt2RXp6OgAgMzMTrVu3hpubGy5duoSQkBCkpqaiT58+n/XnISL6UrGwICIi0Rs5ciRCQ0Plw6XS0tJw9OhRDB8+XOG88ePHo1evXnB2dsbatWthbGyMTZs2AQDWrFkDNzc3LFy4EDVr1oSbmxs2b96MiIgIJCQkfPafiYjoS8PCgoiIRK9Ro0aoVavW/7Vvv67JRXEcxz+OseLCBIMOQQWDBpkMy8qizbImCCIODBYXBP0P7oyCeQztA5vJoiDIGAv+KIIWba7Iwpj4pEcQnzC4D/7a+wUHDueeC9+TLh/O/er5+VmSVK1W5Xa7dXt7u7bv5uZmNT89PVU4HFa/35ckvb+/q9Fo6Pz8fDX8fr8kaTgcbukkAHC8aN4GAByE+/t7lctl5fN5PT09KZlMymKx/Pj9+XyuaDSqx8fHjWdOp/N/lgoAvxI3FgCAgxCPxzUej1UqldTr9ZRIJDb2tNvt1fz7+1uvr68KBAKSpOvra3W7XXk8Hvl8vrVhtVq3dg4AOFYECwDAQbDZbLq7u1Mul1MkEpHL5drYUy6X9fLyosFgoEwmo4+Pj1UfRiaT0Ww2UywWU6fT0XA4VL1eVzKZ1GKx2PZxAODoECwAAAcjlUrp6+tro2n7L8MwZBiGrq6u1Gw2VavVZLfbJUmXl5dqtVpaLBaKRCIKBoPKZrO6uLjQyQmfQwAwy7JcLpe7LgIAgJ+oVCp6eHjQZDLR2dnZan00Gsnr9ert7U2hUGh3BQLAL0bzNgBg731+fmo6ncowDKXT6bVQAQDYD9z9AgD2XrFYlN/vl8PhUKFQ2HU5AIB/4FcoAAAAAKZxYwEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwLQ/cy5tIOTXInQAAAAASUVORK5CYII=", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "ename": "TypeError", "evalue": "can only concatenate str (not \"int\") to str", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[40], line 90\u001b[0m\n\u001b[0;32m 88\u001b[0m years_pub \u001b[38;5;241m=\u001b[39m pubs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlist_publishedYear\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mto_list()\n\u001b[0;32m 89\u001b[0m plt\u001b[38;5;241m.\u001b[39mfigure()\n\u001b[1;32m---> 90\u001b[0m plt\u001b[38;5;241m.\u001b[39mhist(years_pub, bins\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mmin\u001b[39m(years_pub), \u001b[38;5;28;43mmax\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43myears_pub\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m))\n\u001b[0;32m 91\u001b[0m plt\u001b[38;5;241m.\u001b[39mtitle(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPublications by Year\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 92\u001b[0m plt\u001b[38;5;241m.\u001b[39mxlabel(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYear\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "\u001b[1;31mTypeError\u001b[0m: can only concatenate str (not \"int\") to str" ] }, { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import math\n", "# ── 6. SME involvement rate ────────────────────────────────────────────────\n", "sme_lists = df[\"list_SME\"].drop_nulls().to_list()\n", "sme_flag = [any(lst) for lst in sme_lists]\n", "count_sme = sum(sme_flag)\n", "count_non = len(sme_flag) - count_sme\n", "\n", "plt.figure()\n", "plt.bar([\"With SME\", \"Without SME\"], [count_sme, count_non])\n", "plt.title(\"SME Involvement in Projects\")\n", "plt.ylabel(\"Number of Projects\")\n", "plt.show()\n", "\n", "\n", "# ── 7. EC contribution ratio histogram ────────────────────────────────────\n", "raw_ratios = df[\"ecRatio\"].drop_nulls().to_list()\n", "ec_ratios = [r for r in raw_ratios if math.isfinite(r)]\n", "\n", "plt.figure()\n", "plt.hist(ec_ratios, bins=30)\n", "plt.title(\"EC Contribution / Total Cost Ratio\")\n", "plt.xlabel(\"Ratio\")\n", "plt.ylabel(\"Count\")\n", "plt.show()\n", "\n", "\n", "# ── 8. Correlation matrix of financials ───────────────────────────────────\n", "fin_df = (\n", " df\n", " .select([\"ecMaxContribution\", \"netEcContribution\", \"totalCost\"])\n", " .drop_nulls()\n", " .to_pandas()\n", ")\n", "corr = fin_df.corr()\n", "plt.figure(figsize=(6,6))\n", "plt.imshow(corr.values, cmap=\"RdBu\", vmin=-1, vmax=1)\n", "plt.title(\"Correlation Matrix of Financial Columns\")\n", "plt.xticks(range(len(corr)), corr.columns, rotation=90)\n", "plt.yticks(range(len(corr)), corr.columns)\n", "plt.colorbar()\n", "plt.show()\n", "\n", "\n", "# ── 9. Deliverables per project ────────────────────────────────────────────\n", "deliv_counts = [len(lst) for lst in df[\"list_deliverableType\"].drop_nulls().to_list()]\n", "plt.figure()\n", "plt.hist(deliv_counts, bins=range(max(deliv_counts)+2))\n", "plt.title(\"Number of Deliverables per Project\")\n", "plt.xlabel(\"Deliverable Count\")\n", "plt.ylabel(\"Projects\")\n", "plt.show()\n", "\n", "\n", "# ── 10. 
# ── 10. Deliverable types frequency ───────────────────────────────────────
# One row per (project, deliverable type); the aggregated column is named
# "count" explicitly so the plotting code does not depend on the default
# name chosen by the polars version in use.
types = (
    df.explode("list_deliverableType")
    .filter(pl.col("list_deliverableType").is_not_null())
    .group_by("list_deliverableType")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)
plt.figure(figsize=(8, 4))
plt.bar(types["list_deliverableType"].to_list(),
        types["count"].to_list())
plt.title("Deliverable Types")
plt.xlabel("Type")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


# ── 11. Publications per project ──────────────────────────────────────────
pub_counts = [len(lst) for lst in df["list_doi"].drop_nulls().to_list()]
plt.figure()
# `default=0` avoids a ValueError from max() when no project has DOIs.
plt.hist(pub_counts, bins=range(max(pub_counts, default=0) + 2))
plt.title("Publications per Project")
plt.xlabel("Publication Count")
plt.ylabel("Projects")
plt.show()


# ═══ Notebook cell: imports, paths, load + derived columns ════════════════
import os
import polars as pl
import matplotlib.pyplot as plt
import pathlib
from itertools import combinations
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import folium

# Paths
# The data directory can be overridden with the CORDIS_ROOT environment
# variable; the original machine-specific location remains the default.
ROOT = pathlib.Path(os.environ.get(
    "CORDIS_ROOT",
    r"C:\Users\Romain\OneDrive - KU Leuven\Masters\MBIS\Year 2\Semester 2\Modern Data Analytics\CORDIS",
))
OUTDIR = ROOT / "combined"
PARQUET = OUTDIR / "consolidated.parquet"
PROJECT_PUB = OUTDIR / "projectPublications_all.parquet"

# Load consolidated dataset
df = pl.read_parquet(PARQUET)

# Parse the date columns only when they were stored as strings.
# (`DataFrame.with_column` no longer exists in polars >= 0.20; use
# `with_columns`.)
for col in ["startDate", "endDate"]:
    if df[col].dtype == pl.Utf8:
        df = df.with_columns(
            pl.col(col).str.strptime(pl.Date, "%Y-%m-%d").alias(col)
        )

# Project-level net EC contribution = sum of the per-organisation values.
df = df.with_columns(
    pl.col("list_netEcContribution")
    .list.eval(pl.element().cast(pl.Float64), parallel=True)
    .list.sum()
    .alias("netEcContribution")
)

df = df.with_columns(
    pl.col("totalCost").cast(pl.Float64),
    pl.col("netEcContribution").cast(pl.Float64),
)

# Derived columns used by the plotting sections.
# `ecRatio` is inf/NaN when totalCost is 0; consumers must filter for
# finite values.
df = df.with_columns([
    pl.col("startDate").dt.year().alias("startYear"),
    pl.col("endDate").dt.year().alias("endYear"),
    (pl.col("endDate") - pl.col("startDate")).dt.total_days().alias("durationDays"),
    (pl.col("netEcContribution") / pl.col("totalCost")).alias("ecRatio"),
])


# ═══ Notebook cell: sections 12 onwards ═══════════════════════════════════
# ── 12. Yearly publication trends ─────────────────────────────────────────
pubs = (
    df
    .explode("list_publishedYear")
    .filter(pl.col("list_publishedYear").is_not_null())
)

# Years that cannot be cast to an integer become null (strict=False) and are
# dropped. The count column is aliased to "count" because `.group_by().len()`
# names its output "len", which made `pubs_by_year["count"]` fail.
pubs_by_year = (
    pubs
    .with_columns(
        pl.col("list_publishedYear")
        .cast(pl.Int32, strict=False)
        .alias("year_int")
    )
    .filter(pl.col("year_int").is_not_null())
    .group_by("year_int")
    .agg(pl.len().alias("count"))
    .sort("year_int")
)

# Bar-plot the small aggregated table.
years = pubs_by_year["year_int"].to_list()
counts = pubs_by_year["count"].to_list()

plt.figure(figsize=(8, 4))
plt.bar(years, counts)
plt.title("Publications by Year")
plt.xlabel("Year")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# ── 13. Topic frequency ───────────────────────────────────────────────────
# Top 20 topics by number of (project, topic) rows.
topics = (
    df.explode("list_topic")
    .filter(pl.col("list_topic").is_not_null())
    .group_by("list_topic")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
    .head(20)
)
plt.figure(figsize=(8, 4))
plt.bar(topics["list_topic"].to_list(), topics["count"].to_list())
plt.title("Top 20 Topics")
plt.xlabel("Topic")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


# ── 14. Legal basis distribution ──────────────────────────────────────────
legal = (
    df.explode("list_legalBasis")
    .filter(pl.col("list_legalBasis").is_not_null())
    .group_by("list_legalBasis")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)
plt.figure(figsize=(8, 4))
plt.bar(legal["list_legalBasis"].to_list(), legal["count"].to_list())
plt.title("Legal Basis Distribution")
plt.xlabel("Legal Basis")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


# ── 15. Web link status (archived vs. active) ────────────────────────────
# Only projects that actually have web links (non-null, non-empty list) are
# considered. Null entries *inside* a list are kept on purpose: dropping them
# before counting (as the previous version did) removed every non-archived
# link, so the "Active" bar was always ~0.
# NOTE(review): assumes a null/empty archivedDate means the link is still
# active — confirm against the CORDIS webLink schema.
link_dates = (
    df.filter(pl.col("list_archivedDate").list.len() > 0)
    .explode("list_archivedDate")
    ["list_archivedDate"]
    .to_list()
)
archived_flags = [bool(d) for d in link_dates]  # None / "" -> not archived
count_arch = sum(archived_flags)
count_act = len(archived_flags) - count_arch
plt.figure()
plt.bar(["Archived", "Active"], [count_arch, count_act])
plt.title("Web Link Status")
plt.ylabel("Count")
plt.show()
# ── 16. Language coverage ─────────────────────────────────────────────────
langs = (
    df.explode("list_availableLanguages")
    .filter(pl.col("list_availableLanguages").is_not_null())
    .group_by("list_availableLanguages")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)
plt.figure(figsize=(8, 4))
plt.bar(langs["list_availableLanguages"].to_list(),
        langs["count"].to_list())
plt.title("Web Link Language Coverage")
plt.xlabel("Language")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


# ── 17. Attachment types ──────────────────────────────────────────────────
from collections import Counter

attach = (
    df.explode("list_attachment")
    .filter(pl.col("list_attachment").is_not_null())
)
# File extension (lower-cased, including the leading dot) of every attachment.
exts = [pathlib.Path(x).suffix.lower() for x in attach["list_attachment"].to_list()]
# Counter is a single pass (the previous `exts.count(e)` per distinct value
# rescanned the whole list each time); most_common() also gives a stable,
# descending bar order instead of arbitrary set order.
ext_counts = dict(Counter(exts).most_common())
plt.figure()
plt.bar(list(ext_counts.keys()), list(ext_counts.values()))
plt.title("Attachment File Types")
plt.xlabel("Extension")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# ── 18. Summary title lengths ─────────────────────────────────────────────
titles = (
    df.explode("list_title_report")
    .filter(pl.col("list_title_report").is_not_null())
)
# Length in characters of each report title.
title_lens = [len(t) for t in titles["list_title_report"].to_list()]
plt.figure()
plt.hist(title_lens, bins=30)
plt.title("Report Title Lengths")
plt.xlabel("Characters")
plt.ylabel("Count")
plt.show()
# ── 19. Topic vs. Funding: average totalCost by topic ─────────────────────
# A project's totalCost is counted once under each of its topics; the 20
# topics with the highest mean cost are shown.
avg_cost = (
    df.explode("list_topic")
    .filter(pl.col("list_topic").is_not_null() & pl.col("totalCost").is_not_null())
    .group_by("list_topic")
    .agg(pl.col("totalCost").mean().alias("avgCost"))
    .sort("avgCost", descending=True)
    .head(20)
)
plt.figure(figsize=(8, 4))
plt.bar(avg_cost["list_topic"].to_list(), avg_cost["avgCost"].to_list())
plt.title("Average Project Cost by Topic")
plt.xlabel("Topic")
plt.ylabel("Average Cost")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


# ── 20. Time to first publication ─────────────────────────────────────────
# NOTE(review): `contentUpdateDate` is used as the publication date here; it
# looks like a record-update timestamp rather than a true publication date —
# confirm this proxy is intended.
pub_raw = pl.read_parquet(PROJECT_PUB)

# Parse only when the column was stored as text (same dtype guard the setup
# cell applies to startDate/endDate); `.str` on an already-typed datetime
# column would raise.
update_ts = pl.col("contentUpdateDate")
if pub_raw.schema["contentUpdateDate"] == pl.Utf8:
    update_ts = update_ts.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")

pub_df = (
    pub_raw
    .with_columns(update_ts.cast(pl.Date).alias("pubDate"))
    .drop_nulls(subset=["pubDate", "projectID"])
)
first_pub = pub_df.group_by("projectID").agg(pl.col("pubDate").min().alias("firstPub"))
merged = df.join(first_pub, left_on="id", right_on="projectID", how="left")
# Lag in whole days between project start and earliest publication record.
# (`with_column` / `.dt.days()` no longer exist in current polars; use
# `with_columns` / `.dt.total_days()`, as the setup cell already does.)
lags = (
    merged
    .filter(pl.col("firstPub").is_not_null() & pl.col("startDate").is_not_null())
    .with_columns((pl.col("firstPub") - pl.col("startDate")).dt.total_days().alias("lagDays"))
    ["lagDays"]
    .to_list()
)
plt.figure()
plt.hist(lags, bins=30)
plt.title("Days from Start to First Publication")
plt.xlabel("Days")
plt.ylabel("Count")
plt.show()
# ── 21. Objective text clustering (TF-IDF + KMeans + t-SNE) ─────────────
# Non-empty project objectives -> TF-IDF (500 terms, English stop words)
# -> KMeans with 5 clusters, fitted on all objectives.
objectives = [o for o in df["objective"].drop_nulls().to_list() if o.strip()]
vectorizer = TfidfVectorizer(max_features=500, stop_words="english")
X = vectorizer.fit_transform(objectives)
kmeans = KMeans(n_clusters=5, random_state=42).fit(X)

# Only the first N_EMBED documents are embedded with t-SNE, for speed.
N_EMBED = 500
coords = TSNE(n_components=2, random_state=42).fit_transform(X[:N_EMBED].toarray())
# Cluster labels for the same rows, so the plot actually shows the clusters
# (previously KMeans was fitted but its labels were never used).
cluster_ids = kmeans.labels_[:N_EMBED]

plt.figure(figsize=(6, 5))
plt.scatter(coords[:, 0], coords[:, 1], c=cluster_ids, cmap="tab10", s=10, alpha=0.6)
plt.title("Objective Clusters (t-SNE)")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.tight_layout()
plt.show()
\n", " Upgrade to ydata-sdk\n", "

\n", " Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.\n", "

\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Romain\\AppData\\Local\\Temp\\ipykernel_22464\\834880225.py:1: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead.\n", " from pandas_profiling import ProfileReport\n", "c:\\Users\\Romain\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\ydata_profiling\\profile_report.py:365: UserWarning: Try running command: 'pip install --upgrade Pillow' to avoid ValueError\n", " warnings.warn(\n", "Summarize dataset: 9%|▉ | 6/68 [00:04<00:34, 1.77it/s, Describe variable: list_deliverableType]" ] } ], "source": [ "from ydata_profiling import ProfileReport\n", "report = ProfileReport(\n", " consolidated.to_pandas(),\n", " title=\"Polars EDA Report\",\n", " explorative=True\n", ")\n", "report.to_file(\"polars_report.html\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "shape: (5, 62)
idacronymstatustitlestartDateendDatetotalCostecMaxContributionlegalBasistopicsecSignatureDateframeworkProgrammemasterCallsubCallfundingSchemenatureobjectivecontentUpdateDatercngrantDoilist_deliverableTypelist_urllist_contentUpdateDatelist_authorslist_titlelist_doilist_journalTitlelist_isPublishedAslist_publishedYearlist_contentUpdateDate_publilist_title_reportlist_attachmentlist_contentUpdateDate_reportlist_organisationIDlist_countrylist_namelist_SMElist_citylist_geolocationlist_organizationURLlist_rolelist_ecContributionlist_netEcContributionlist_totalCostlist_endOfParticipationlist_activityTypelist_contentUpdateDate_orglist_physUrllist_availableLanguageslist_statuslist_archivedDatelist_typelist_sourcelist_representslist_legalBasislist_title_legallist_uniqueProgrammePartlist_topiclist_title_topiclist_euroSciVocTitlelist_euroSciVocPathlist_description
i64strstrstrdatedatef64f64strstrdatestrstrstrstrstrstrdatetime[μs]i64strlist[str]list[str]list[datetime[μs]]list[str]list[str]list[str]list[str]list[str]list[i64]list[datetime[μs]]list[str]list[str]list[datetime[μs]]list[i64]list[str]list[str]list[bool]list[str]list[str]list[str]list[str]list[f64]list[f64]list[f64]list[bool]list[str]list[datetime[μs]]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[str]list[bool]list[str]list[str]list[str]list[str]list[str]
101116741"DOE""SIGNED""Digitizing Other Economies: A …2024-02-012029-01-311.499998e61.499998e6"HORIZON.1.1""ERC-2023-STG"2023-12-15"HORIZON""ERC-2023-STG""ERC-2023-STG""HORIZON-ERC"null"How do longstanding, primarily…2023-12-26 14:40:12259247"10.3030/101116741"nullnullnullnullnullnullnullnullnullnullnullnullnull[999981634]["NL"]["WAGENINGEN UNIVERSITY"][false]["Wageningen"]["51.9863279,5.6679366"]["http://www.wageningenur.nl/nl.htm"]["coordinator"][1.499998e6][1.499998e6][1.499998e6][false]["HES"][2023-12-26 14:40:12]nullnullnullnullnullnullnull["HORIZON.1.1"]["European Research Council (ERC)"][true]["ERC-2023-STG"]["ERC STARTING GRANTS"]["government systems"]["/social sciences/political sciences/government systems"][null]
101163161"IRASTRO""SIGNED""MOLECULAR QUANTUM DYNAMICS IN …2025-08-012031-07-311.2085363e71.2085363e7"HORIZON.1.1""ERC-2024-SyG"2025-02-20"HORIZON""ERC-2024-SyG""ERC-2024-SyG""HORIZON-ERC-SYG"null"The James Webb Space Telescope…2025-02-24 17:23:14268970"10.3030/101163161"nullnullnullnullnullnullnullnullnullnullnullnullnull[999997736, 999854855, 999990267]["DK", "DE", "DE"]["AARHUS UNIVERSITET", "UNIVERSITAET POTSDAM", "MAX-PLANCK-GESELLSCHAFT ZUR FORDERUNG DER WISSENSCHAFTEN EV"][false, false, false]["Aarhus C", "Potsdam", "Munchen"]["56.171028,10.199381", "52.3979172,13.0146272", "48.1411687,11.5822929"]["http://www.au.dk", "http://www.uni-potsdam.de", "http://www.mpg.de"]["participant", "participant", "coordinator"][4.24624e6, 1.982813e6, 5.85631e6][4.24624e6, 1.982813e6, 5.85631e6][4.24624e6, 1.982813e6, 5.85631e6][false, false, false]["HES", "HES", "REC"][2025-02-24 17:23:14, 2025-02-24 17:23:14, 2025-02-24 17:23:14]nullnullnullnullnullnullnull["HORIZON.1.1"]["European Research Council (ERC)"][true]["ERC-2024-SyG"]["ERC SYNERGY GRANTS"]["grains and oilseeds", "microscopy", … "spectroscopy"]["/agricultural sciences/agriculture, forestry, and fisheries/agriculture/grains and oilseeds", "/natural sciences/physical sciences/optics/microscopy", … "/natural sciences/physical sciences/optics/spectroscopy"][null, null, … null]
101160499"In-phase""SIGNED""Multiscale modelling of aberra…2025-06-012030-05-311.489128e61.489128e6"HORIZON.1.1""ERC-2024-STG"2025-02-19"HORIZON""ERC-2024-STG""ERC-2024-STG""HORIZON-ERC"null"The spatiotemporal organizatio…2025-02-24 17:23:11268969"10.3030/101160499"nullnullnullnullnullnullnullnullnullnullnullnullnull[999874546]["ES"]["UNIVERSIDAD COMPLUTENSE DE MADRID"][false]["Madrid"]["40.4343404,-3.7340644"]["http://www.ucm.es"]["coordinator"][1.489128e6][1.489128e6][1.489128e6][false]["HES"][2025-02-24 17:23:11]nullnullnullnullnullnullnull["HORIZON.1.1"]["European Research Council (ERC)"][true]["ERC-2024-STG"]["ERC STARTING GRANTS"]["proteins", "RNA"]["/natural sciences/biological sciences/biochemistry/biomolecules/proteins", "/natural sciences/biological sciences/genetics/RNA"][null, null]
101166905"mw-atlas""SIGNED""The first comprehensive Atlas …2025-05-012031-04-309.95756e69.95756e6"HORIZON.1.1""ERC-2024-SyG"2025-02-20"HORIZON""ERC-2024-SyG""ERC-2024-SyG""HORIZON-ERC-SYG"null"The Milky Way is the cosmic en…2025-02-24 17:23:01268971"10.3030/101166905"nullnullnullnullnullnullnullnullnullnullnullnullnull[999983962, 999995893, 999990267]["DE", "EL", "DE"]["RHEINISCH-WESTFAELISCHE TECHNISCHE HOCHSCHULE AACHEN", "IDRYMA TECHNOLOGIAS KAI EREVNAS", "MAX-PLANCK-GESELLSCHAFT ZUR FORDERUNG DER WISSENSCHAFTEN EV"][false, false, false]["Aachen", "Irakleio", "Munchen"]["50.7777954,6.07779426513182", "35.335976,25.126581", "48.1411687,11.5822929"]["http://www.rwth-aachen.de", "http://www.forth.gr", "http://www.mpg.de"]["participant", "participant", "coordinator"][2.885875e6, 3.251685e6, 3.82e6][2.885875e6, 3.251685e6, 3.82e6][2.885875e6, 3.251685e6, 3.82e6][false, false, false]["HES", "REC", "REC"][2025-02-24 17:23:01, 2025-02-24 17:23:01, 2025-02-24 17:23:01]nullnullnullnullnullnullnull["HORIZON.1.1"]["European Research Council (ERC)"][true]["ERC-2024-SyG"]["ERC SYNERGY GRANTS"]["cartography", "astrophysics", "physical cosmology"]["/natural sciences/earth and related environmental sciences/physical geography/cartography", "/natural sciences/physical sciences/astronomy/astrophysics", "/natural sciences/physical sciences/astronomy/physical cosmology"][null, null, null]
101162875"MAtCHLESS""SIGNED""Untapping multiparametric 2D l…2025-03-012030-02-281.5e61.5e6"HORIZON.1.1""ERC-2024-STG"2025-02-18"HORIZON""ERC-2024-STG""ERC-2024-STG""HORIZON-ERC"null"Cellular organisms are complex…2025-02-24 17:23:12268966"10.3030/101162875"nullnullnullnullnullnullnullnullnullnullnullnullnull[999897341, 999861354]["IT", "ES"]["UNIVERSITA CA' FOSCARI  VENEZIA", "UNIVERSIDAD AUTONOMA DE MADRID"][false, false]["Venezia", "Madrid"]["45.43458557360008,12.325949378563257", "40.4167047,-3.7035824"]["http://www.unive.it", "http://www.uam.es"]["coordinator", "participant"][1.45e6, 50000.0][1.45e6, 50000.0][1.45e6, 50000.0][false, false]["HES", "HES"][2025-02-24 17:23:12, 2025-02-24 17:23:12]nullnullnullnullnullnullnull["HORIZON.1.1"]["European Research Council (ERC)"][true]["ERC-2024-STG"]["ERC STARTING GRANTS"]["planets", "sensors", … "microbiology"]["/natural sciences/physical sciences/astronomy/planetary sciences/planets", "/engineering and technology/electrical engineering, electronic engineering, information engineering/electronic engineering/sensors", … "/natural sciences/biological sciences/microbiology"][null, null, … null]
" ], "text/plain": [ "shape: (5, 62)\n", "┌───────────┬───────────┬────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐\n", "│ id ┆ acronym ┆ status ┆ title ┆ … ┆ list_title ┆ list_euro ┆ list_euro ┆ list_desc │\n", "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ _topic ┆ SciVocTit ┆ SciVocPat ┆ ription │\n", "│ i64 ┆ str ┆ str ┆ str ┆ ┆ --- ┆ le ┆ h ┆ --- │\n", "│ ┆ ┆ ┆ ┆ ┆ list[str] ┆ --- ┆ --- ┆ list[str] │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ list[str] ┆ list[str] ┆ │\n", "╞═══════════╪═══════════╪════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡\n", "│ 101116741 ┆ DOE ┆ SIGNED ┆ Digitizing ┆ … ┆ [\"ERC ┆ [\"governm ┆ [\"/social ┆ [null] │\n", "│ ┆ ┆ ┆ Other ┆ ┆ STARTING ┆ ent ┆ sciences/ ┆ │\n", "│ ┆ ┆ ┆ Economies: ┆ ┆ GRANTS\"] ┆ systems\"] ┆ political ┆ │\n", "│ ┆ ┆ ┆ A … ┆ ┆ ┆ ┆ s… ┆ │\n", "│ 101163161 ┆ IRASTRO ┆ SIGNED ┆ MOLECULAR ┆ … ┆ [\"ERC ┆ [\"grains ┆ [\"/agricu ┆ [null, │\n", "│ ┆ ┆ ┆ QUANTUM ┆ ┆ SYNERGY ┆ and oilse ┆ ltural ┆ null, … │\n", "│ ┆ ┆ ┆ DYNAMICS ┆ ┆ GRANTS\"] ┆ eds\", ┆ sciences/ ┆ null] │\n", "│ ┆ ┆ ┆ IN … ┆ ┆ ┆ \"micro… ┆ agric… ┆ │\n", "│ 101160499 ┆ In-phase ┆ SIGNED ┆ Multiscale ┆ … ┆ [\"ERC ┆ [\"protein ┆ [\"/natura ┆ [null, │\n", "│ ┆ ┆ ┆ modelling ┆ ┆ STARTING ┆ s\", ┆ l science ┆ null] │\n", "│ ┆ ┆ ┆ of aberra… ┆ ┆ GRANTS\"] ┆ \"RNA\"] ┆ s/biologi ┆ │\n", "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cal… ┆ │\n", "│ 101166905 ┆ mw-atlas ┆ SIGNED ┆ The first ┆ … ┆ [\"ERC ┆ [\"cartogr ┆ [\"/natura ┆ [null, │\n", "│ ┆ ┆ ┆ comprehens ┆ ┆ SYNERGY ┆ aphy\", ┆ l science ┆ null, │\n", "│ ┆ ┆ ┆ ive Atlas ┆ ┆ GRANTS\"] ┆ \"astrophy ┆ s/earth ┆ null] │\n", "│ ┆ ┆ ┆ … ┆ ┆ ┆ sics\"… ┆ and … ┆ │\n", "│ 101162875 ┆ MAtCHLESS ┆ SIGNED ┆ Untapping ┆ … ┆ [\"ERC ┆ [\"planets ┆ [\"/natura ┆ [null, │\n", "│ ┆ ┆ ┆ multiparam ┆ ┆ STARTING ┆ \", \"senso ┆ l science ┆ null, … │\n", "│ ┆ ┆ ┆ etric 2D ┆ ┆ GRANTS\"] ┆ rs\", … ┆ s/physica ┆ null] │\n", "│ ┆ ┆ ┆ l… ┆ ┆ ┆ \"micr… ┆ l s… ┆ │\n", 
"└───────────┴───────────┴────────┴────────────┴───┴────────────┴───────────┴───────────┴───────────┘" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "consolidated.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting SPARQLWrapper\n", " Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)\n", "Collecting rdflib>=6.1.1 (from SPARQLWrapper)\n", " Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)\n", "Requirement already satisfied: pyparsing<4,>=2.1.0 in c:\\users\\romain\\appdata\\local\\programs\\python\\python311\\lib\\site-packages (from rdflib>=6.1.1->SPARQLWrapper) (3.0.9)\n", "Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)\n", "Downloading rdflib-7.1.4-py3-none-any.whl (565 kB)\n", " ---------------------------------------- 0.0/565.1 kB ? eta -:--:--\n", " ---------------------------------------- 565.1/565.1 kB 2.8 MB/s eta 0:00:00\n", "Installing collected packages: rdflib, SPARQLWrapper\n", "Successfully installed SPARQLWrapper-2.0.0 rdflib-7.1.4\n", "Note: you may need to restart the kernel to use updated packages.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "DEPRECATION: Loading egg at c:\\users\\romain\\appdata\\local\\programs\\python\\python311\\lib\\site-packages\\holehe-1.61-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. 
import os
import json
import argparse
from datetime import datetime
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Optional, Literal

import polars as pl                              # pip install polars
from SPARQLWrapper import SPARQLWrapper, JSON    # pip install SPARQLWrapper
#from langchain.docstore.document import Document  # pip install langchain
#from langchain.embeddings import OpenAIEmbeddings  # pip install langchain openai
#from langchain.vectorstores import FAISS  # pip install faiss-cpu
#from langchain.llms import OpenAI  # pip install openai
#from langchain.chains import RetrievalQA

###############################################################################
# Configuration                                                               #
###############################################################################

SPARQL_ENDPOINT = "https://cordis.europa.eu/datalab/sparql"
CACHE_PATH = "cache.json"        # master cache for projects core
INDEX_DIR = "faiss_horizon"      # FAISS persistence dir
MAX_ROWS = 25_000                # protective cap per query

# Every PREFIX needs an IRI; a bare "PREFIX eurio:" is a SPARQL syntax error.
# NOTE(review): IRIs below are the standard namespaces for these vocabularies
# (EURIO, Dublin Core Terms, FOAF, schema.org, SKOS) - confirm against the
# CORDIS endpoint documentation.
PREFIXES = """
PREFIX eurio:  <http://data.europa.eu/s66#>
PREFIX dct:    <http://purl.org/dc/terms/>
PREFIX foaf:   <http://xmlns.com/foaf/0.1/>
PREFIX schema: <http://schema.org/>
PREFIX skos:   <http://www.w3.org/2004/02/skos/core#>
"""

QUERIES: Dict[str, str] = {
    # ────────────────────────────────────────────────────────── core project
    "projects": f"""{PREFIXES}
    SELECT ?projectID ?title ?abstract ?start ?end ?status
    WHERE {{
      ?p a eurio:Project ;
         dct:title ?title ;
         dct:abstract ?abstract ;
         eurio:projectID ?projectID .
      OPTIONAL {{ ?p eurio:startDate ?start. }}
      OPTIONAL {{ ?p eurio:endDate ?end. }}
      OPTIONAL {{ ?p eurio:projectStatus ?status. }}
    }}
    LIMIT {MAX_ROWS}
    """,

    # ────────────────────────────────────────────────────────── deliverables
    "deliverable": f"""{PREFIXES}
    SELECT ?projectID ?deliverableType ?url ?contentUpdateDate
    WHERE {{
      ?p a eurio:Project ; eurio:projectID ?projectID ; eurio:hasDeliverable ?d .
      OPTIONAL {{ ?d eurio:deliverableType ?deliverableType. }}
      OPTIONAL {{ ?d eurio:url ?url. }}
      OPTIONAL {{ ?d eurio:contentUpdateDate ?contentUpdateDate. }}
    }}
    LIMIT {MAX_ROWS}
    """,

    # ────────────────────────────────────────────────────────── publications
    "publication": f"""{PREFIXES}
    SELECT ?projectID ?authors ?title ?doi ?journalTitle ?isPublishedAs ?publishedYear ?contentUpdateDate
    WHERE {{
      ?p a eurio:Project ; eurio:projectID ?projectID ; eurio:producedPublication ?pub .
      OPTIONAL {{ ?pub dct:title ?title. }}
      OPTIONAL {{ ?pub eurio:authors ?authors. }}
      OPTIONAL {{ ?pub eurio:doi ?doi. }}
      OPTIONAL {{ ?pub eurio:journalTitle ?journalTitle. }}
      OPTIONAL {{ ?pub eurio:isPublishedAs ?isPublishedAs. }}
      OPTIONAL {{ ?pub eurio:publishedYear ?publishedYear. }}
      OPTIONAL {{ ?pub eurio:contentUpdateDate ?contentUpdateDate. }}
    }}
    LIMIT {MAX_ROWS}
    """,

    # ────────────────────────────────────────────────────────── organisation
    "organisation": f"""{PREFIXES}
    SELECT ?projectID ?organisationID ?country ?name ?SME ?city ?geolocation ?organizationURL ?role ?ecContribution ?netEcContribution ?totalCost ?endOfParticipation ?activityType ?contentUpdateDate
    WHERE {{
      ?p a eurio:Project ; eurio:projectID ?projectID ; eurio:hasParticipant ?org .
      OPTIONAL {{ ?org eurio:organisationID ?organisationID. }}
      OPTIONAL {{ ?org schema:addressCountry ?country. }}
      OPTIONAL {{ ?org foaf:name|schema:name ?name. }}
      OPTIONAL {{ ?org eurio:SME ?SME. }}
      OPTIONAL {{ ?org schema:addressLocality ?city. }}
      OPTIONAL {{ ?org eurio:hasCoordinates ?geolocation. }}
      OPTIONAL {{ ?org schema:url ?organizationURL. }}
      OPTIONAL {{ ?org eurio:role ?role. }}
      OPTIONAL {{ ?org eurio:ecContribution ?ecContribution. }}
      OPTIONAL {{ ?org eurio:netEcContribution ?netEcContribution. }}
      OPTIONAL {{ ?org eurio:totalCost ?totalCost. }}
      OPTIONAL {{ ?org eurio:endOfParticipation ?endOfParticipation. }}
      OPTIONAL {{ ?org eurio:activityType ?activityType. }}
      OPTIONAL {{ ?org eurio:contentUpdateDate ?contentUpdateDate. }}
    }}
    LIMIT {MAX_ROWS}
    """,

    # ────────────────────────────────────────────────────────── EuroSciVoc
    "voc": f"""{PREFIXES}
    SELECT ?projectID ?euroSciVocTitle ?euroSciVocPath ?euroSciVocDescription
    WHERE {{
      ?p a eurio:Project ; eurio:projectID ?projectID ; eurio:mainSubject ?v .
      OPTIONAL {{ ?v skos:prefLabel ?euroSciVocTitle. }}
      OPTIONAL {{ ?v skos:broader* ?path . ?path skos:prefLabel ?euroSciVocPath. }}
      OPTIONAL {{ ?v skos:definition ?euroSciVocDescription. }}
    }}
    LIMIT {MAX_ROWS}
    """,

    # ────────────────────────────────────────────────────────── topic call
    "topic": f"""{PREFIXES}
    SELECT ?projectID ?topic ?title
    WHERE {{
      ?p a eurio:Project ; eurio:projectID ?projectID ; eurio:relatedTopic ?t .
      OPTIONAL {{ ?t skos:notation ?topic. }}
      OPTIONAL {{ ?t skos:prefLabel ?title. }}
    }}
    LIMIT {MAX_ROWS}
    """,

    # ────────────────────────────────────────────────────────── web link
    "weblink": f"""{PREFIXES}
    SELECT ?projectID ?physUrl ?availableLanguages ?status ?archivedDate ?type ?source ?represents
    WHERE {{
      ?p a eurio:Project ; eurio:projectID ?projectID ; eurio:hasWebSite ?w .
      OPTIONAL {{ ?w eurio:physUrl ?physUrl. }}
      OPTIONAL {{ ?w eurio:availableLanguages ?availableLanguages. }}
      OPTIONAL {{ ?w eurio:status ?status. }}
      OPTIONAL {{ ?w eurio:archivedDate ?archivedDate. }}
      OPTIONAL {{ ?w eurio:type ?type. }}
      OPTIONAL {{ ?w eurio:source ?source. }}
      OPTIONAL {{ ?w eurio:represents ?represents. }}
    }}
    LIMIT {MAX_ROWS}
    """,

    # ────────────────────────────────────────────────────────── legal basis
    "legal": f"""{PREFIXES}
    SELECT ?projectID ?legalBasis ?title ?uniqueProgrammePart
    WHERE {{
      ?p a eurio:Project ; eurio:projectID ?projectID ; eurio:legalBasis ?l .
      OPTIONAL {{ ?l dct:title ?title. }}
      OPTIONAL {{ ?l eurio:legalBasis ?legalBasis. }}
      OPTIONAL {{ ?l eurio:uniqueProgrammePart ?uniqueProgrammePart. }}
    }}
    LIMIT {MAX_ROWS}
    """,
}

###############################################################################
# Generic SPARQL runner                                                       #
###############################################################################

def _fetch(query: str) -> Tuple[List[str], List[Dict[str, str]]]:
    """Execute `query` once and return ``(variables, rows)``.

    `variables` is the list under ``head.vars`` of the JSON response (empty
    if the endpoint omits it); `rows` holds one ``{variable: value}`` dict per
    binding. Variables left unbound by an OPTIONAL clause are absent from
    that row's dict.
    """
    sparql = SPARQLWrapper(SPARQL_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    # Query exactly once: every .query() call is a fresh HTTP round-trip.
    payload = sparql.query().convert()
    variables = list(payload.get("head", {}).get("vars", []))
    rows = [
        {var: cell["value"] for var, cell in binding.items()}
        for binding in payload["results"]["bindings"]
    ]
    return variables, rows


def _run(query: str) -> List[Dict[str, str]]:
    """Execute `query` against SPARQL_ENDPOINT and return its rows.

    Each row maps a bound variable name to the binding's ``"value"`` string.
    """
    return _fetch(query)[1]

###############################################################################
# Hydration (replicating your Polars parquet workflow)                        #
###############################################################################

def hydrate_entities(out_dir: Optional[str] = None, force: bool = False) -> Dict[str, pl.DataFrame]:
    """Download every query in QUERIES and return ``{name: Polars DF}``.

    Parameters
    ----------
    out_dir:
        When given, the directory is created if needed and each non-empty
        result is saved to ``<out_dir>/<name>.parquet``. An existing parquet
        file is read instead of re-querying unless `force` is true.
    force:
        Re-run the SPARQL query even if a cached parquet file exists.

    Returns
    -------
    Dict mapping each key of QUERIES to its DataFrame. Freshly fetched frames
    have one string column per SELECT variable reported by the endpoint, so a
    query that matches nothing yields a zero-row frame instead of raising
    ``NoDataError``.
    """
    out: Dict[str, pl.DataFrame] = {}
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for name, query in QUERIES.items():
        path = os.path.join(out_dir, f"{name}.parquet") if out_dir else None
        if path and os.path.exists(path) and not force:
            print(f"[cache] {name} – reading cached parquet")
            df = pl.read_parquet(path)
        else:
            print(f"[SPARQL] Fetching {name} …")
            variables, rows = _fetch(query)
            # An explicit schema lets polars build a frame from zero rows and
            # keeps OPTIONAL columns that are unbound in the sampled rows.
            schema = {var: pl.Utf8 for var in variables}
            if schema:
                df = pl.from_dicts(rows, schema=schema)
            elif rows:
                df = pl.from_dicts(rows)
            else:
                df = pl.DataFrame()

            if df.height == 0:
                # Do not persist an empty result: it would be served from the
                # cache on every later run and hide a corrected query.
                print(f"[warn] {name} – query returned no rows; nothing cached")
            elif path:
                df.write_parquet(path)
                print(f"[save] {name} → {path} (rows={df.shape[0]})")
        out[name] = df
    return out
186\u001b[0m rows \u001b[38;5;241m=\u001b[39m _run(query)\n\u001b[0;32m 187\u001b[0m \u001b[38;5;28mprint\u001b[39m(rows)\n\u001b[1;32m--> 188\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_dicts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrows\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 189\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m path:\n\u001b[0;32m 190\u001b[0m df\u001b[38;5;241m.\u001b[39mwrite_parquet(path)\n", "File \u001b[1;32mc:\\Users\\Romain\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\polars\\convert\\general.py:209\u001b[0m, in \u001b[0;36mfrom_dicts\u001b[1;34m(data, schema, schema_overrides, strict, infer_schema_length)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (schema \u001b[38;5;129;01mor\u001b[39;00m schema_overrides):\n\u001b[0;32m 208\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mno data, cannot infer schema\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m--> 209\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m NoDataError(msg)\n\u001b[0;32m 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pl\u001b[38;5;241m.\u001b[39mDataFrame(\n\u001b[0;32m 212\u001b[0m data,\n\u001b[0;32m 213\u001b[0m schema\u001b[38;5;241m=\u001b[39mschema,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 216\u001b[0m infer_schema_length\u001b[38;5;241m=\u001b[39minfer_schema_length,\n\u001b[0;32m 217\u001b[0m )\n", "\u001b[1;31mNoDataError\u001b[0m: no data, cannot infer schema" ] } ], "source": [ "hydrate_entities(r\"C:\\Users\\Romain\\OneDrive - KU Leuven\\MDA\")" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'head': {'link': [], 'vars': ['project_title', 'projectID']}, 'results': {'distinct': False, 'ordered': True, 'bindings': 
from SPARQLWrapper import SPARQLWrapper, JSON

# Ad-hoc probe of the CORDIS endpoint: list one page of (projectID, title).
endpoint = "https://cordis.europa.eu/datalab/sparql"

# A PREFIX declaration must carry an IRI.
# NOTE(review): EURIO namespace IRI restored from the public ontology - confirm.
# NOTE(review): the pattern types ?project as eurio:Result yet asks for
# eurio:projectID; the recorded run returned zero bindings - verify the class.
query = """
PREFIX eurio: <http://data.europa.eu/s66#>
SELECT ?project_title ?projectID
WHERE {
  ?project a eurio:Result .
  ?project eurio:title ?project_title .
  ?project eurio:projectID ?projectID .
}
ORDER BY ?project_title
LIMIT 100
OFFSET 1000
"""

sparql = SPARQLWrapper(endpoint)
sparql.setQuery(query)
sparql.setReturnFormat(JSON)

results = sparql.query().convert()
bindings = results["results"]["bindings"]
print(f"{len(bindings)} bindings for vars {results['head']['vars']}")
for row in bindings:
    # Keys are the SELECT variable names: the title lives under
    # "project_title", not "title".
    print(row["projectID"]["value"], "→", row["project_title"]["value"])