{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting pyarrow\n",
" Downloading pyarrow-19.0.1-cp310-cp310-win_amd64.whl (25.3 MB)\n",
" ---------------------------------------- 25.3/25.3 MB 2.4 MB/s eta 0:00:00\n",
"Installing collected packages: pyarrow\n",
"Successfully installed pyarrow-19.0.1\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: You are using pip version 22.0.4; however, version 25.0.1 is available.\n",
"You should consider upgrading via the 'c:\\Users\\Romain\\AppData\\Local\\Programs\\Python\\Python310\\python.exe -m pip install --upgrade pip' command.\n"
]
}
],
"source": [
"%pip install pyarrow"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import re\n",
"import polars as pl"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"import re, csv, pathlib, polars as pl\n",
"\n",
"# Root folder of the raw CORDIS export: combine_all_programmes() scans its\n",
"# immediate sub-directories and reads `<dataset>.csv` from each of them.\n",
"# NOTE(review): absolute, machine-specific path; must be edited to run elsewhere.\n",
"ROOT = pathlib.Path(r\"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\")\n",
"# Base names (without `.csv`) of the files looked up in every programme folder.\n",
"DATASETS = [\n",
" \"project\",\n",
" \"projectDeliverables\",\n",
" \"projectPublications\",\n",
" \"reportSummaries\",\n",
" \"organization\",\n",
" \"euroSciVoc\",\n",
" \"topics\",\n",
" \"webItem\",\n",
" \"webLink\",\n",
" \"legalBasis\",\n",
"]\n",
"# Destination of the `<dataset>_all.parquet` files; created when this cell runs.\n",
"# `parents` is not passed to mkdir(), so ROOT itself must already exist.\n",
"OUTDIR = ROOT / \"combined\"\n",
"OUTDIR.mkdir(exist_ok=True)\n",
"\n",
"###############################################################################\n",
"# 2. Generic cleaner: patterns handed to _clean_one_file() as `number_regex`\n",
"###############################################################################\n",
"# _clean_one_file() applies these with fullmatch() and strips leading zeros\n",
"# from every cell that matches.\n",
"# Chosen for the \"project\" dataset. As a full match this accepts exactly four\n",
"# characters: 19 or 20 followed by two digits.\n",
"_PROJECT_ID_RE = re.compile(r\"^(?:19|20)\\d{2}\")\n",
"# Chosen for every other dataset. As a full match this accepts exactly four digits.\n",
"_GENERIC_NUM_RE = re.compile(r\"\\d{4}\")\n",
"\n",
"import csv, pathlib, polars as pl, re\n",
"\n",
"import csv, re, pathlib\n",
"import polars as pl # >=0.20\n",
"\n",
"import csv, pathlib, re\n",
"import polars as pl # ≥ 0.20\n",
"\n",
"\n",
"# A cell that is a complete ISO date (YYYY-MM-DD).\n",
"_DATE_CELL_RE = re.compile(r\"\\d{4}-\\d{2}-\\d{2}$\")\n",
"\n",
"# Row layouts (0-based column indices) for the datasets whose rows are\n",
"# structurally repaired; every other dataset only gets cell-level clean-ups.\n",
"#   expected : width of a well-formed row\n",
"#   overflow : column that absorbs surplus cells (re-joined with ';')\n",
"#   trailing : number of fixed columns that follow the overflow column\n",
"#   title / date1 / date2 : present only when a split title is repaired too\n",
"_ROW_LAYOUTS = {\n",
"    \"project\": {\n",
"        \"expected\": 20, \"overflow\": 16, \"trailing\": 3,\n",
"        \"title\": 3, \"date1\": 4, \"date2\": 5,\n",
"    },\n",
"    \"organization\": {\"expected\": 25, \"overflow\": 4, \"trailing\": 20},\n",
"}\n",
"\n",
"\n",
"def _is_date_or_blank(cell: str) -> bool:\n",
"    \"\"\"Return True for an empty cell or a cell holding a YYYY-MM-DD date.\"\"\"\n",
"    return cell == \"\" or bool(_DATE_CELL_RE.match(cell))\n",
"\n",
"\n",
"def _rejoin_title(cells: list, layout: dict):\n",
"    \"\"\"Glue a title that was split on ';' back into one cell.\n",
"\n",
"    Scans from the first date column for the first two successive cells that\n",
"    are both dates or blanks and treats everything between the title column\n",
"    and that position as the title.  Returns None when no such pair exists.\n",
"    \"\"\"\n",
"    title_at = layout[\"title\"]\n",
"    pos = layout[\"date1\"]\n",
"    while pos + 1 < len(cells):\n",
"        if _is_date_or_blank(cells[pos]) and _is_date_or_blank(cells[pos + 1]):\n",
"            return cells[:title_at] + [\";\".join(cells[title_at:pos])] + cells[pos:]\n",
"        pos += 1\n",
"    return None\n",
"\n",
"\n",
"def _repair_row(cells: list, layout: dict):\n",
"    \"\"\"Bring *cells* to the layout's expected width, or return None to drop the row.\"\"\"\n",
"    expected = layout[\"expected\"]\n",
"\n",
"    # 1) title repair: only for layouts that declare title/date columns, and\n",
"    #    only when the row is too wide and the date columns do not hold dates.\n",
"    if \"title\" in layout and len(cells) > expected and not (\n",
"        _is_date_or_blank(cells[layout[\"date1\"]])\n",
"        and _is_date_or_blank(cells[layout[\"date2\"]])\n",
"    ):\n",
"        cells = _rejoin_title(cells, layout)\n",
"        if cells is None:  # no valid date pair found -> give up on this line\n",
"            return None\n",
"\n",
"    # 2) overflow repair: fold surplus cells back into the overflow column,\n",
"    #    keeping the fixed trailing columns in place.\n",
"    if len(cells) > expected:\n",
"        start, keep = layout[\"overflow\"], layout[\"trailing\"]\n",
"        cells = cells[:start] + [\";\".join(cells[start:-keep])] + cells[-keep:]\n",
"\n",
"    # 3) pad short rows with empty cells; anything still off-width is dropped.\n",
"    if len(cells) < expected:\n",
"        cells = cells + [\"\"] * (expected - len(cells))\n",
"    return cells if len(cells) == expected else None\n",
"\n",
"\n",
"def _clean_cell(cell: str, number_regex) -> str:\n",
"    \"\"\"Normalise one cell: blank out '\"\"', flatten tabs, collapse triple quotes,\n",
"    trim whitespace, strip leading zeros when *number_regex* fully matches, and\n",
"    finally strip surrounding double quotes.\"\"\"\n",
"    if cell in ('\"\"', \"\"):\n",
"        cell = \"\"\n",
"    else:\n",
"        cell = cell.replace(\"\\t\", \" \").replace('\"\"\"', '\"').strip()\n",
"    if number_regex.fullmatch(cell):\n",
"        cell = cell.lstrip(\"0\") or \"0\"\n",
"    return cell.strip('\"')\n",
"\n",
"\n",
"def _clean_one_file(csv_path: pathlib.Path,\n",
"                    number_regex: re.Pattern[str], dataset: str) -> pl.DataFrame:\n",
"    \"\"\"\n",
"    Clean one semicolon-separated CORDIS CSV and load it as an all-string frame.\n",
"\n",
"    The file is read line by line and split blindly on ';', so free-text\n",
"    fields containing semicolons spill into extra columns.  For the datasets\n",
"    listed in ``_ROW_LAYOUTS`` (\"project\": 20 columns, \"organization\": 25\n",
"    columns) each row is repaired:\n",
"\n",
"    * \"project\" only: if the row is too wide and the two date columns do not\n",
"      hold dates, the spilled title cells are re-joined with ';'.\n",
"    * surplus cells are folded back into the overflow column, keeping the\n",
"      fixed trailing columns (3 for \"project\", 20 for \"organization\").\n",
"    * short rows are padded with empty cells; rows that cannot be repaired\n",
"      are silently dropped.\n",
"\n",
"    Rows of every other dataset are passed through without structural repair.\n",
"    All rows, including the header line, then get the cell-level clean-ups of\n",
"    ``_clean_cell``; in addition double quotes are removed from the first and\n",
"    last cell and commas from the last cell.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    csv_path : path of the raw CSV (read as UTF-8).\n",
"    number_regex : cells fully matching this pattern lose their leading zeros.\n",
"    dataset : dataset name; selects the row layout, if any.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pl.DataFrame with every column read as a string and \"\" read as null.\n",
"\n",
"    Side effects\n",
"    ------------\n",
"    Writes the cleaned, '|'-separated copy next to the input as\n",
"    ``<stem>.cleaned.csv`` and leaves it on disk.\n",
"    \"\"\"\n",
"    layout = _ROW_LAYOUTS.get(dataset)  # None -> no structural repair\n",
"    tmp_clean = csv_path.with_suffix(\".cleaned.csv\")\n",
"\n",
"    with csv_path.open(encoding=\"utf-8\", newline=\"\") as fin, \\\n",
"         tmp_clean.open(\"w\", encoding=\"utf-8\", newline=\"\") as fout:\n",
"\n",
"        writer = csv.writer(\n",
"            fout,\n",
"            delimiter=\"|\",\n",
"            quotechar='\"',\n",
"            quoting=csv.QUOTE_MINIMAL,\n",
"            lineterminator=\"\\n\",\n",
"        )\n",
"\n",
"        for raw in fin:\n",
"            cells = raw.rstrip(\"\\n\").split(\";\")  # blind split\n",
"\n",
"            if layout is not None:\n",
"                cells = _repair_row(cells, layout)\n",
"                if cells is None:  # unrepairable line -> skip\n",
"                    continue\n",
"\n",
"            cleaned = [_clean_cell(cell, number_regex) for cell in cells]\n",
"            # first / last cell get an extra scrub of stray quotes (and commas)\n",
"            cleaned[-1] = cleaned[-1].replace('\"', '').replace(',', '')\n",
"            cleaned[0] = cleaned[0].replace('\"', '')\n",
"            writer.writerow(cleaned)\n",
"\n",
"    # infer_schema_length=0 keeps every column as a string; typing is done by\n",
"    # the caller.\n",
"    return pl.read_csv(\n",
"        tmp_clean,\n",
"        separator=\"|\",\n",
"        quote_char='\"',\n",
"        has_header=True,\n",
"        infer_schema_length=0,\n",
"        null_values=[\"\"],\n",
"        truncate_ragged_lines=True,\n",
"    )\n",
"\n",
"\n",
"_TIMESTAMP_FORMAT = \"%Y-%m-%d %H:%M:%S\"\n",
"\n",
"# Programme folder whose projectPublications.csv uses the upper-case headers\n",
"# below; they are renamed to the names used by the other folders.\n",
"_LEGACY_PUBLICATIONS_FOLDER = \"H2013\"\n",
"_LEGACY_PUBLICATIONS_RENAME = {\n",
"    \"RECORD_ID\": \"id\",\n",
"    \"TITLE\": \"title\",\n",
"    \"AUTHOR\": \"authors\",\n",
"    \"DOI\": \"doi\",\n",
"    \"PROJECT_ID\": \"projectID\",\n",
"    \"JOURNAL_TITLE\": \"journalTitle\",\n",
"    \"PAGES\": \"publishedPages\",\n",
"    \"PUBLICATION_TYPE\": \"isPublishedAs\",\n",
"}\n",
"\n",
"_PROJECT_TEXT_COLS = (\n",
"    \"acronym\", \"status\", \"title\", \"legalBasis\", \"topics\", \"frameworkProgramme\",\n",
"    \"masterCall\", \"subCall\", \"fundingScheme\", \"nature\", \"objective\", \"grantDoi\",\n",
")\n",
"_PROJECT_AMOUNT_COLS = (\"totalCost\", \"ecMaxContribution\")\n",
"_PROJECT_DATE_COLS = (\"startDate\", \"endDate\", \"ecSignatureDate\")\n",
"\n",
"\n",
"def _as_text(name: str) -> pl.Expr:\n",
"    \"\"\"Column *name* as Utf8 with leading/trailing double quotes stripped.\"\"\"\n",
"    return pl.col(name).cast(pl.Utf8, strict=False).str.strip_chars('\"')\n",
"\n",
"\n",
"def _as_timestamp(name: str) -> pl.Expr:\n",
"    \"\"\"Column *name* parsed with _TIMESTAMP_FORMAT (non-strict).\"\"\"\n",
"    return (\n",
"        pl.col(name).cast(pl.Utf8, strict=False)\n",
"        .str.strptime(pl.Datetime, _TIMESTAMP_FORMAT, strict=False)\n",
"    )\n",
"\n",
"\n",
"def _coerce_types(df: pl.DataFrame, dataset: str, programme: str) -> pl.DataFrame:\n",
"    \"\"\"Apply the dataset-specific casts / derived columns to a cleaned frame.\n",
"\n",
"    *programme* is the name of the folder the frame was read from.  Datasets\n",
"    without a rule are returned unchanged.\n",
"    \"\"\"\n",
"    if dataset == \"project\":\n",
"        # `id` is deliberately left as a string.\n",
"        casts = [_as_text(name) for name in _PROJECT_TEXT_COLS]\n",
"        casts += [\n",
"            # decimal comma -> decimal point before the float cast\n",
"            _as_text(name).str.replace_all('\"', '').str.replace(\",\", \".\").cast(pl.Float64)\n",
"            for name in _PROJECT_AMOUNT_COLS\n",
"        ]\n",
"        casts += [\n",
"            _as_text(name).str.strptime(pl.Date, \"%Y-%m-%d\", strict=False)\n",
"            for name in _PROJECT_DATE_COLS\n",
"        ]\n",
"        casts += [\n",
"            _as_text(\"contentUpdateDate\")\n",
"            .str.strptime(pl.Datetime, _TIMESTAMP_FORMAT, strict=False),\n",
"            pl.col(\"rcn\").cast(pl.Int64),\n",
"        ]\n",
"        return df.with_columns(casts).with_columns(\n",
"            pl.lit(programme).alias(\"programmeFolder\")\n",
"        )\n",
"\n",
"    if dataset == \"organization\":\n",
"        return df.with_columns([\n",
"            _as_timestamp(\"contentUpdateDate\"),\n",
"            pl.col(\"totalCost\").cast(pl.Utf8, strict=False)\n",
"            .str.replace(\",\", \".\")\n",
"            .cast(pl.Float64),\n",
"        ])\n",
"\n",
"    if dataset in (\"projectDeliverables\", \"reportSummaries\"):\n",
"        return df.with_columns(_as_timestamp(\"contentUpdateDate\"))\n",
"\n",
"    if dataset == \"projectPublications\":\n",
"        if programme == _LEGACY_PUBLICATIONS_FOLDER:\n",
"            return df.rename(_LEGACY_PUBLICATIONS_RENAME)\n",
"        return df.with_columns([\n",
"            _as_timestamp(\"contentUpdateDate\"),\n",
"            # projectID is the run of digits before the first '_' of `id`\n",
"            pl.col(\"id\").cast(pl.Utf8, strict=False)\n",
"            .str.extract(r\"^(\\d+)_\", 1)\n",
"            .alias(\"projectID\"),\n",
"        ])\n",
"\n",
"    if dataset == \"webItem\":\n",
"        return df.with_columns(\n",
"            pl.col(\"uri\").cast(pl.Utf8, strict=False)\n",
"            .str.extract(r\"/files/\\d+/(\\d+)/\", 1)\n",
"            .cast(pl.Int64)\n",
"            .alias(\"projectID\"),\n",
"        )\n",
"\n",
"    return df\n",
"\n",
"\n",
"def combine_all_programmes() -> None:\n",
"    \"\"\"Build one `<dataset>_all.parquet` per entry of DATASETS.\n",
"\n",
"    For every dataset, each sub-directory of ROOT that contains\n",
"    `<dataset>.csv` is cleaned with `_clean_one_file`, typed with\n",
"    `_coerce_types` and the resulting frames are concatenated and written to\n",
"    OUTDIR.  Folders are visited in sorted order so the row order of the\n",
"    output does not depend on the file system.  Datasets found in no folder\n",
"    produce no file.  Progress is printed to stdout.\n",
"    \"\"\"\n",
"    for dataset in DATASETS:\n",
"        number_regex = _PROJECT_ID_RE if dataset == \"project\" else _GENERIC_NUM_RE\n",
"        combined: list[pl.DataFrame] = []\n",
"\n",
"        for programme_dir in sorted(ROOT.iterdir()):\n",
"            if not programme_dir.is_dir():\n",
"                continue\n",
"            csv_file = programme_dir / f\"{dataset}.csv\"\n",
"            if not csv_file.exists():\n",
"                continue\n",
"\n",
"            df = _clean_one_file(csv_file, number_regex, dataset)\n",
"            print(programme_dir)\n",
"            combined.append(_coerce_types(df, dataset, programme_dir.name))\n",
"\n",
"        if not combined:\n",
"            continue\n",
"\n",
"        # The legacy publications file has a different column set, so that\n",
"        # dataset is concatenated diagonally; all others share one schema.\n",
"        how = \"diagonal\" if dataset == \"projectPublications\" else \"vertical_relaxed\"\n",
"        result = pl.concat(combined, how=how)\n",
"        parquet_path = OUTDIR / f\"{dataset}_all.parquet\"\n",
"        result.write_parquet(parquet_path)\n",
"        print(f\"✔ {dataset:15s} → {parquet_path}\")"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1984\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ project → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\project_all.parquet\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ projectDeliverables → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\projectDeliverables_all.parquet\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ projectPublications → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\projectPublications_all.parquet\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ reportSummaries → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\reportSummaries_all.parquet\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1984\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ organization → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\organization_all.parquet\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1984\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ euroSciVoc → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\euroSciVoc_all.parquet\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ topics → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\topics_all.parquet\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ webItem → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\webItem_all.parquet\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ webLink → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\webLink_all.parquet\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1984\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1987\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1990\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1994\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H1998\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2006\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2013\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2020\n",
"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\H2027\n",
"✔ legalBasis → C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\\combined\\legalBasis_all.parquet\n"
]
}
],
"source": [
"# Rebuild every `<dataset>_all.parquet` in OUTDIR from the raw per-programme CSVs.\n",
"combine_all_programmes()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Consolidate projects"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pathlib\n",
"import polars as pl\n",
"\n",
"# --- configuration ----------------------------------------------------------\n",
"# NOTE(review): absolute, machine-specific path; must be edited to run elsewhere.\n",
"ROOT = pathlib.Path(r\"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\")\n",
"OUTDIR = ROOT / \"combined\"\n",
"DATASETS = [\n",
"    \"project\",\n",
"    \"projectDeliverables\",\n",
"    \"projectPublications\",\n",
"    \"reportSummaries\",\n",
"    \"organization\",\n",
"    \"euroSciVoc\",\n",
"    \"topics\",\n",
"    \"webItem\",\n",
"    \"webLink\",\n",
"    \"legalBasis\",\n",
"]\n",
"\n",
"# --- load the per-dataset parquet files --------------------------------------\n",
"dfs = {}\n",
"for dataset in DATASETS:\n",
"    path = OUTDIR / f\"{dataset}_all.parquet\"\n",
"    dfs[dataset] = pl.read_parquet(path)\n",
"\n",
"\n",
"def _lists_per_project(frame, columns, aliases=None):\n",
"    \"\"\"Collapse *frame* to one row per `projectID`.\n",
"\n",
"    Each name in *columns* is gathered into a list column called\n",
"    `list_<name>`, unless *aliases* maps the name to another output name.\n",
"    \"\"\"\n",
"    aliases = aliases or {}\n",
"    return frame.group_by(\"projectID\").agg(\n",
"        [pl.col(name).alias(aliases.get(name, f\"list_{name}\")) for name in columns]\n",
"    )\n",
"\n",
"\n",
"# --- one row per project for every satellite table ---------------------------\n",
"projects = dfs[\"project\"]\n",
"\n",
"projects_deliv = _lists_per_project(\n",
"    dfs[\"projectDeliverables\"],\n",
"    [\"deliverableType\", \"url\", \"contentUpdateDate\"],\n",
")\n",
"\n",
"projects_publi = _lists_per_project(\n",
"    dfs[\"projectPublications\"],\n",
"    [\"authors\", \"title\", \"doi\", \"journalTitle\", \"isPublishedAs\",\n",
"     \"publishedYear\", \"contentUpdateDate\"],\n",
")\n",
"\n",
"report = _lists_per_project(\n",
"    dfs[\"reportSummaries\"],\n",
"    [\"title\", \"attachment\", \"contentUpdateDate\"],\n",
")\n",
"\n",
"org = _lists_per_project(\n",
"    dfs[\"organization\"],\n",
"    [\"organisationID\", \"country\", \"name\", \"SME\", \"city\", \"geolocation\",\n",
"     \"organizationURL\", \"role\", \"ecContribution\", \"netEcContribution\",\n",
"     \"totalCost\", \"endOfParticipation\", \"activityType\", \"contentUpdateDate\"],\n",
")\n",
"\n",
"voc = _lists_per_project(\n",
"    dfs[\"euroSciVoc\"],\n",
"    [\"euroSciVocTitle\", \"euroSciVocPath\", \"euroSciVocDescription\"],\n",
"    aliases={\"euroSciVocDescription\": \"list_description\"},\n",
")\n",
"\n",
"topic = _lists_per_project(dfs[\"topics\"], [\"topic\", \"title\"])\n",
"\n",
"web_item = dfs[\"webItem\"]  # no aggregation; not joined below\n",
"\n",
"web_link = _lists_per_project(\n",
"    dfs[\"webLink\"],\n",
"    [\"physUrl\", \"availableLanguages\", \"status\", \"archivedDate\", \"type\",\n",
"     \"source\", \"represents\"],\n",
")\n",
"\n",
"legal = _lists_per_project(\n",
"    dfs[\"legalBasis\"],\n",
"    [\"legalBasis\", \"title\", \"uniqueProgrammePart\"],\n",
")\n",
"\n",
"# --- left-join everything onto the project table -----------------------------\n",
"# The join order matters: when two tables share a list column name, the first\n",
"# one joined keeps the plain name and later ones receive their suffix.\n",
"consolidated = (\n",
"    projects\n",
"    .join(projects_deliv, left_on=\"id\", right_on=\"projectID\", suffix=\"_deliv\", how=\"left\")\n",
"    .join(projects_publi, left_on=\"id\", right_on=\"projectID\", suffix=\"_publi\", how=\"left\")\n",
"    .join(report, left_on=\"id\", right_on=\"projectID\", suffix=\"_report\", how=\"left\")\n",
"    .join(org, left_on=\"id\", right_on=\"projectID\", suffix=\"_org\", how=\"left\")\n",
"    .join(web_link, left_on=\"id\", right_on=\"projectID\", suffix=\"_link\", how=\"left\")\n",
"    .join(legal, left_on=\"id\", right_on=\"projectID\", suffix=\"_legal\", how=\"left\")\n",
"    .join(topic, left_on=\"id\", right_on=\"projectID\", suffix=\"_topic\", how=\"left\")\n",
"    .join(voc, left_on=\"id\", right_on=\"projectID\", suffix=\"_voc\", how=\"left\")\n",
")\n",
"\n",
"# --- make sure the date columns are real dates --------------------------------\n",
"# Only relevant when the parquet still holds them as strings.\n",
"for col in [\"startDate\", \"endDate\"]:\n",
"    if consolidated[col].dtype == pl.Utf8:\n",
"        # `with_columns` (plural) is the current DataFrame API; the former\n",
"        # singular `with_column` raises AttributeError.\n",
"        consolidated = consolidated.with_columns(\n",
"            pl.col(col).str.strptime(pl.Date, \"%Y-%m-%d\").alias(col)\n",
"        )\n",
"\n",
"# --- derived columns ----------------------------------------------------------\n",
"# Per-project sum of the partners' net EC contribution.\n",
"# NOTE(review): in the displayed output a list holding only nulls sums to 0.0,\n",
"# so ecRatio shows 0.0 rather than null for such projects; confirm this is wanted.\n",
"consolidated = consolidated.with_columns(\n",
"    pl.col(\"list_netEcContribution\").list.eval(pl.element().cast(pl.Float64), parallel=True)\n",
"      .list.sum().alias(\"netEcContribution\")\n",
")\n",
"\n",
"consolidated = consolidated.with_columns(\n",
"    pl.col(\"totalCost\").cast(pl.Float64),\n",
"    pl.col(\"netEcContribution\").cast(pl.Float64)\n",
")\n",
"\n",
"consolidated = consolidated.with_columns([\n",
"    pl.col(\"startDate\").dt.year().alias(\"startYear\"),\n",
"    pl.col(\"endDate\").dt.year().alias(\"endYear\"),\n",
"    (pl.col(\"endDate\") - pl.col(\"startDate\")).dt.total_days().alias(\"durationDays\"),\n",
"    (pl.col(\"netEcContribution\") / pl.col(\"totalCost\")).alias(\"ecRatio\"),\n",
"])\n",
"\n",
"consolidated.write_parquet(OUTDIR / \"consolidated.parquet\")\n",
"\n",
"# --- drop the framework programmes FP1 to FP6 ---------------------------------\n",
"excluded_frameworks = [\"FP1\", \"FP2\", \"FP3\", \"FP4\", \"FP5\", \"FP6\"]\n",
"\n",
"consolidated_clean = consolidated.filter(~pl.col(\"frameworkProgramme\").is_in(excluded_frameworks))\n",
"\n",
"consolidated_clean.write_parquet(OUTDIR / \"consolidated_clean.parquet\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"import pathlib\n",
"# Reload the filtered table from disk so the cells below can run without\n",
"# re-executing the consolidation cell.\n",
"# NOTE: ROOT and OUTDIR are rebound here to a different directory than the one\n",
"# the consolidation cell writes to; OUTDIR is ROOT itself (no \"combined\"\n",
"# sub-folder), so consolidated_clean.parquet must already be present there.\n",
"ROOT = pathlib.Path(r\"C:\\Users\\Romain\\OneDrive - KU Leuven\\MDA\\backend\\data\")\n",
"OUTDIR = ROOT #/ \"combined\"\n",
"\n",
"#consolidated = pl.read_parquet(OUTDIR / \"consolidated.parquet\")\n",
"consolidated_clean = pl.read_parquet(OUTDIR / \"consolidated_clean.parquet\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"shape: (5, 68)
id
acronym
status
title
startDate
endDate
totalCost
ecMaxContribution
legalBasis
topics
ecSignatureDate
frameworkProgramme
masterCall
subCall
fundingScheme
nature
objective
contentUpdateDate
rcn
grantDoi
programmeFolder
list_deliverableType
list_url
list_contentUpdateDate
list_authors
list_title
list_doi
list_journalTitle
list_isPublishedAs
list_publishedYear
list_contentUpdateDate_publi
list_title_report
list_attachment
list_contentUpdateDate_report
list_organisationID
list_country
list_name
list_SME
list_city
list_geolocation
list_organizationURL
list_role
list_ecContribution
list_netEcContribution
list_totalCost
list_endOfParticipation
list_activityType
list_contentUpdateDate_org
list_physUrl
list_availableLanguages
list_status
list_archivedDate
list_type
list_source
list_represents
list_legalBasis
list_title_legal
list_uniqueProgrammePart
list_topic
list_title_topic
list_euroSciVocTitle
list_euroSciVocPath
list_description
netEcContribution
startYear
endYear
durationDays
ecRatio
str
str
str
str
date
date
f64
f64
str
str
date
str
str
str
str
str
str
datetime[μs]
i64
str
str
list[str]
list[str]
list[datetime[μs]]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[datetime[μs]]
list[str]
list[str]
list[datetime[μs]]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[f64]
list[str]
list[str]
list[datetime[μs]]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
list[str]
f64
i32
i32
i64
f64
"624794"
"COMPACTABILITY"
"CLOSED"
"Contribution of Compact Neighb…
2014-12-01
2016-11-30
309235.2
309235.2
"FP7-PEOPLE"
"FP7-PEOPLE-2013-IEF"
null
"FP7"
null
"FP7-PEOPLE-2013-IEF"
"MC-IEF"
null
"This research investigates how…
2017-04-10 11:25:29
187874
null
"H2013"
null
null
null
null
null
null
null
null
null
null
["Final Report Summary - COMPACTABILITY (Contribution of Compact Neighbourhoods to Social Sustainability)"]
["/docs/results/624/624794/final1-table-1.jpg"]
[2017-03-07 17:25:15]
["999446873"]
["UK"]
["OXFORD BROOKES UNIVERSITY"]
[null]
["Oxford"]
["51.7520131,-1.2578498"]
["http://www.brookes.ac.uk"]
["coordinator"]
["309235.2"]
[null]
[null]
["false"]
["HES"]
[2017-04-10 11:25:29]
null
null
null
null
null
null
null
["FP7-PEOPLE"]
["Specific programme "People" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"]
[null]
["FP7-PEOPLE-2013-IEF"]
["Marie-Curie Action: Intra-European fellowships for career development"]
null
null
null
0.0
2014
2016
730
0.0
"276810"
"ARCHOSL"
"CLOSED"
"Archives of Early Human Occupa…
2011-03-01
2014-02-28
75000.0
75000.0
"FP7-PEOPLE"
"FP7-PEOPLE-2009-RG"
null
"FP7"
null
"FP7-PEOPLE-2010-RG"
"MC-IRG"
null
"A number of important archaeol…
2019-08-02 13:24:51
98178
null
"H2013"
null
null
null
["Arnold, L.J., Demuro, M., Parés, J.M., Arsuaga, J.L., Aranburu, A.,", "Lee J. Arnold , Martina Demuro , Marta Navazo , Alfonso Benito-Calvo , Alfredo Pérez-González", … "F. Gutiérrez , B. Valero-Garcés , G. Desir , P. González-Sampériz , M. Gutiérrez , R. Linares , M. Zarroca , A. Moreno , J. Guerrero , C. Roqué"]
["Luminescence dating and palaeomagnetic age constraint on hominins from Sima de los Huesos, Atapuerca, Spain", "OSL dating of the Middle Palaeolithic Hotel California site, Sierra de Atapuerca, north-central Spain", … "Late Holocene evolution of playa lakes in the central Ebro depression based on geophysical surveys and morpho-stratigraphic analysis of lacustrine terraces"]
["Final Report Summary - ARCHOSL (Archives of Early Human Occupation in Western Europe: OSL Chronologies beyond the Middle Pleistocene in the Iberian Peninsula)"]
[null]
[2014-11-07 13:26:06]
["986579241"]
["ES"]
["CENTRO NACIONAL DE INVESTIGACION SOBRE LA EVOLUCION HUMANA"]
[null]
["Burgos"]
["42.3396185,-3.6967044"]
["http://www.cenieh.es"]
["coordinator"]
["75000"]
[null]
[null]
["false"]
["REC"]
[2019-08-02 13:24:51]
null
null
null
null
null
null
null
["FP7-PEOPLE"]
["Specific programme "People" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"]
[null]
["FP7-PEOPLE-2009-RG"]
["Marie Curie Action: Reintegration Grants"]
["ethnoarchaeology", "physical anthropology"]
["/humanities/history and archaeology/archaeology/ethnoarchaeology", "/social sciences/sociology/anthropology/physical anthropology"]
[null, null]
0.0
2011
2014
1095
0.0
"622478"
"DETforDRF 2.0"
"CLOSED"
"Design and Expansion Turbine f…
null
null
161968.8
161968.8
"FP7-PEOPLE"
"FP7-PEOPLE-2013-IEF"
null
"FP7"
null
"FP7-PEOPLE-2013-IEF"
"MC-IEF"
null
"This proposal for a Marie Curi…
2016-03-31 21:10:31
187686
null
"H2013"
null
null
null
null
null
null
null
null
null
null
null
null
null
["953573536"]
["DE"]
["BSH HAUSGERATE GMBH"]
[null]
["Munchen"]
["48.0887063,11.6433468"]
["http://www.bsh-group.com"]
["coordinator"]
["161968.8"]
[null]
[null]
["false"]
["PRC"]
[2016-03-31 21:10:31]
null
null
null
null
null
null
null
["FP7-PEOPLE"]
["Specific programme "People" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"]
[null]
["FP7-PEOPLE-2013-IEF"]
["Marie-Curie Action: Intra-European fellowships for career development"]
["Rebekah Plueckhahn", "Dulam, Bumochir", … "•Empson, R. A."]
["Tragic Spirits: Shamanism, Memory, and Gender in Contemporary Mongolia by Manduhai Buyandelger.", "The Afterlife of Nomadism: Pastoralism, environmentalism, civilization and identity in Mongolia and China", … "A Space That Will Never Be Filled Sharp Communication and the Simultaneity of Opposites."]
["10.1111/aman.12304", null, … null]
["American Anthropologist", "Pastoralist Livelihoods in Asian Drylands: Environment, Governance and Risk", … "Current Anthropology"]
["PEER_REVIEWED_ARTICLE", "ARTICLE", … "ARTICLE"]
[null, null, … null]
[null, null, … null]
["Final Report Summary - EMERGING SUBJECTS (Emerging Subjects of the New Economy: Tracing Economic Growth in Mongolia)"]
[null]
[2018-01-15 17:25:25]
["888898146"]
[null]
["UNIVERSITY COLLEGE LONDON"]
[null]
["LONDON"]
["51.5236746,-0.1339608"]
["http://www.ucl.ac.uk"]
["coordinator"]
["1658373"]
[null]
[null]
["false"]
["HES"]
[2023-04-05 11:40:06]
null
null
null
null
null
null
null
["FP7-IDEAS-ERC"]
["Specific programme: "Ideas" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"]
[null]
["ERC-CG-2013-SH2"]
["ERC Consolidator Grant - Institutions Values Beliefs and behaviour"]
["anthropology"]
["/social sciences/sociology/anthropology"]
[null]
0.0
2014
2019
1763
0.0
"237010"
"DEER PALAEOBIOLOGY"
"CLOSED"
"Palaeobiological inference thr…
2009-04-09
2011-01-08
173416.47
173416.47
"FP7-PEOPLE"
"FP7-PEOPLE-IEF-2008"
null
"FP7"
null
"FP7-PEOPLE-IEF-2008"
"MC-IEF"
null
"The present research aims to r…
2019-07-16 19:18:25
90424
null
"H2013"
null
null
null
["Lister, A.M., Breda, M. and others", "Breda, M., Lister, A.M. & others"]
["Metric analysis of ungulate mammals in the early Middle Pleistocene of Britain, in relation to taxonomy and biostratigraphy. II. Cervidae, Equidae and Suidae.", "Metric analysis of ungulate mammals in the early Middle Pleistocene of Britain, in relation to taxonomy and biostratigraphy. I: Rhinocerotidae and Bovidae."]
["Final Report Summary - DEER PALAEOBIOLOGY (Palaeobiological inference through phylogenetic analysis of Pleistocene deer)"]
[null]
[2013-07-05 00:02:53]
["999642037"]
["UK"]
["NATURAL HISTORY MUSEUM"]
[null]
["London"]
["51.494882,-0.1847716"]
["http://www.nhm.ac.uk/"]
["coordinator"]
["173416.47"]
[null]
[null]
["false"]
["PUB"]
[2019-07-16 19:18:25]
null
null
null
null
null
null
null
["FP7-PEOPLE"]
["Specific programme "People" implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)"]
[null]
["FP7-PEOPLE-IEF-2008"]
["Marie Curie Action: Intra-European Fellowships for Career Development"]
["European programme (EEC) for research and development in information technologies (ESPRIT), 1984-1988"]
[null]
null
null
["software", "sensors"]
["/natural sciences/computer and information sciences/software", "/engineering and technology/electrical engineering, electronic engineering, information engineering/electronic engineering/sensors"]
[null, null]
0.0
1985
1988
1095
null
"395"
"INCA"
null
"An Integrated Network Architec…
1984-09-01
1989-09-01
null
null
"FP1-ESPRIT 1"
null
null
"FP1"
null
null
null
null
"The principal purpose of the I…
1992-12-09 00:00:02
8633
null
"H1984"
null
null
null
null
null
null
null
null
null
null
null
null
null
[null, null, … null]
["IT", "IT", … "UK"]
["Ingegneria C. Olivetti and C. SpA", "System Wizards Srl", … "Birkbeck College, University of London"]
["European programme (EEC) for research and development in information technologies (ESPRIT), 1984-1988"]
[null]
null
null
["software", "telecommunications"]
["/natural sciences/computer and information sciences/software", "/engineering and technology/electrical engineering, electronic engineering, information engineering/information engineering/telecommunications"]
[null, null]
0.0
1984
1989
1826
null
"EN3M0096"
null
null
"IMPROVEMENT AND APPLICATION OF…
1988-07-01
1989-02-28
null
null
"FP1-ENNONUC 3C"
null
null
"FP1"
null
null
"CSC"
null
"THERE WILL BE NUMEROUS SCIENTI…
1994-01-23 00:00:02
12659
null
"H1984"
null
null
null
null
null
null
null
null
null
null
null
null
null
[null]
["FR"]
["Chambre de Commerce et d'Industrie de Paris (CCIP)"]
[null]
["Paris"]
["46.6769224,-1.4244831"]
[null]
["coordinator"]
[null]
[null]
[null]
[null]
[null]
[1994-01-23 00:00:02]
null
null
null
null
null
null
null
["FP1-ENNONUC 3C"]
["Research and development programme (EEC) in the field of Non-Nuclear Energy, 1985-1988"]
[null]
null
null
["software"]
["/natural sciences/computer and information sciences/software"]
["European programme (EEC) for research and development in information technologies (ESPRIT), 1984-1988"]
[null]
null
null
["software"]
["/natural sciences/computer and information sciences/software"]
[null]
0.0
1985
1986
365
null
"EV4T0018"
null
null
"ANALYSIS OF OPTICAL AND THERMA…
1987-12-01
1990-11-30
null
null
"FP1-TECHHAZ C"
null
null
"FP1"
null
null
"CSC"
null
"DEVELOPMENT OF APPROPRIATE DIG…
1994-01-23 00:00:02
11996
null
"H1984"
null
null
null
null
null
null
null
null
null
null
null
null
null
[null]
["BE"]
["VON KARMAN INSTITUTE FOR FLUID DYNAMICS"]
[null]
["RHODE-ST-GENESE"]
["50.7562383,4.3873549"]
[null]
["coordinator"]
[null]
[null]
[null]
[null]
[null]
[1994-01-23 00:00:02]
null
null
null
null
null
null
null
["FP1-TECHHAZ C"]
["Multiannual R&D programmes (EEC) in the field of the environment - Pilot projects on major technological hazards -, 1986-1990"]
[null]
null
null
["graphic design", "software", … "laser physics"]
["/social sciences/media and communications/graphic design", "/natural sciences/computer and information sciences/software", … "/natural sciences/physical sciences/optics/laser physics"]
[null, null, … null]
0.0
1987
1990
1095
null
"
],
"text/plain": [
"shape: (5, 68)\n",
"┌──────────┬───────────┬────────┬───────────────┬───┬───────────┬─────────┬──────────────┬─────────┐\n",
"│ id ┆ acronym ┆ status ┆ title ┆ … ┆ startYear ┆ endYear ┆ durationDays ┆ ecRatio │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ str ┆ str ┆ ┆ i32 ┆ i32 ┆ i64 ┆ f64 │\n",
"╞══════════╪═══════════╪════════╪═══════════════╪═══╪═══════════╪═════════╪══════════════╪═════════╡\n",
"│ 1476 ┆ null ┆ null ┆ Integrated ┆ … ┆ 1985 ┆ 1988 ┆ 1095 ┆ null │\n",
"│ ┆ ┆ ┆ Sensor-Based ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ ┆ ┆ Robot … ┆ ┆ ┆ ┆ ┆ │\n",
"│ 395 ┆ INCA ┆ null ┆ An Integrated ┆ … ┆ 1984 ┆ 1989 ┆ 1826 ┆ null │\n",
"│ ┆ ┆ ┆ Network ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ ┆ ┆ Architec… ┆ ┆ ┆ ┆ ┆ │\n",
"│ EN3M0096 ┆ null ┆ null ┆ IMPROVEMENT ┆ … ┆ 1988 ┆ 1989 ┆ 242 ┆ null │\n",
"│ ┆ ┆ ┆ AND ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ ┆ ┆ APPLICATION ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ ┆ ┆ OF… ┆ ┆ ┆ ┆ ┆ │\n",
"│ 874 ┆ CONCORDIA ┆ null ┆ Integrated ┆ … ┆ 1985 ┆ 1986 ┆ 365 ┆ null │\n",
"│ ┆ ┆ ┆ Environment ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ ┆ ┆ for Rel… ┆ ┆ ┆ ┆ ┆ │\n",
"│ EV4T0018 ┆ null ┆ null ┆ ANALYSIS OF ┆ … ┆ 1987 ┆ 1990 ┆ 1095 ┆ null │\n",
"│ ┆ ┆ ┆ OPTICAL AND ┆ ┆ ┆ ┆ ┆ │\n",
"│ ┆ ┆ ┆ THERMA… ┆ ┆ ┆ ┆ ┆ │\n",
"└──────────┴───────────┴────────┴───────────────┴───┴───────────┴─────────┴──────────────┴─────────┘"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"consolidated.head()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['en' 'de' 'it' 'sw' 'ro' 'pl' 'pt' 'ca' 'af' 'es' 'nl' 'no' 'da' 'vi'\n",
" 'et' 'fr' 'cy' 'tl' 'so' 'sv' 'tr' 'id' 'lt' 'hu' 'fi' 'hr' 'sl'\n",
" 'unknown' 'sk' 'cs']\n",
"title\n",
"en 56676\n",
"de 1012\n",
"it 343\n",
"ro 238\n",
"ca 224\n",
"fr 144\n",
"da 131\n",
"es 81\n",
"tl 69\n",
"vi 69\n",
"nl 61\n",
"pt 60\n",
"af 46\n",
"no 43\n",
"id 36\n",
"so 29\n",
"sv 24\n",
"pl 15\n",
"cy 14\n",
"et 11\n",
"fi 10\n",
"sw 9\n",
"hr 6\n",
"sl 5\n",
"lt 3\n",
"tr 2\n",
"unknown 2\n",
"hu 1\n",
"sk 1\n",
"cs 1\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"from langdetect import DetectorFactory, detect\n",
"from langdetect.lang_detect_exception import LangDetectException\n",
"\n",
"# langdetect is randomised by default; pin the seed so re-runs yield the same labels.\n",
"DetectorFactory.seed = 0\n",
"\n",
"\n",
"def lang_text(text):\n",
"    \"\"\"Return the language code langdetect assigns to `text`.\n",
"\n",
"    Returns \"unknown\" when `text` is not a non-blank string (e.g. a null\n",
"    title) or when langdetect cannot extract any features from it.\n",
"    \"\"\"\n",
"    if not isinstance(text, str) or not text.strip():\n",
"        return \"unknown\"\n",
"    try:\n",
"        return detect(text)\n",
"    except LangDetectException:\n",
"        # Only the library's own failure is mapped to \"unknown\"; anything else\n",
"        # (including KeyboardInterrupt) is allowed to propagate.\n",
"        return \"unknown\"\n",
"\n",
"\n",
"def is_english(text):\n",
"    \"\"\"Return True only when `text` is detected as English ('en').\"\"\"\n",
"    return lang_text(text) == 'en'\n",
"\n",
"\n",
"# Keep rows with a non-null status and hand only the `title` column to pandas;\n",
"# the remaining columns are not needed for language detection.\n",
"languages = (\n",
"    consolidated_clean\n",
"    .filter(pl.col('status').is_not_null())\n",
"    .select('title')\n",
"    .to_pandas()['title']\n",
"    .apply(lang_text)\n",
")\n",
"unique_languages = languages.unique()\n",
"print(unique_languages)\n",
"language_counts = languages.value_counts()\n",
"print(language_counts)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🧹 Preparing data...\n",
"💡 Embedding text...\n",
"Loading saved embeddings for column 'title'...\n",
"Fitting SVD for column 'title'...\n",
"Loading saved embeddings for column 'objective'...\n",
"Fitting SVD for column 'objective'...\n",
"Loading saved embeddings for column 'topic_title'...\n",
"Fitting SVD for column 'topic_title'...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[I 2025-05-12 21:30:14,899] A new study created in memory with name: no-name-7695811c-115a-4d9f-a17b-3fc16faf602a\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"🧱 Building pipeline...\n",
"🎯 Training model with Optuna...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['AW', 'GM', 'GN', 'GY', 'JE', 'MV'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['CARRETERA AL AJUSCO NUM. 377'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['FI', 'GA', 'NO', 'RO'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['COMPUTATIONAL TOPOLOGY', 'ELECTRICAL ENGINEERING, ELECTRONIC ENGINEERING, INFORMATION ENGINEERING', 'FLORICULTURE', 'FRUGAL ARTIFICIAL INTELLIGENCE', 'GENERAL MEDICINE', 'HISTORY OF PHILOSOPHY', 'INFORMATION ENGINEERING', 'ISLAMIC SCHOOLS', 'OTHER SOCIAL SCIENCES', 'SILICENE', 'ULTRAVIOLET LASERS'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BI', 'FJ', 'GD', 'GW', 'HT', 'LS', 'PG', 'SV', 'ZZ'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['3D4DDC224DA059CAA718E6F471585874', 'ET', 'HR', 'LT', 'MT', 'PT', 'SR'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['APPLIED MECHANICS', 'BASIC MEDICINE', 'EDGE ARTIFICIAL INTELLIGENCE', 'FIXED WIRELESS NETWORK', 'ICE GIANTS', 'ORTHODONTICS', 'PERIODONTICS', 'PLANT CLONING', 'REGIONAL HUMAN RIGHTS', 'STRABISMUS'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BB', 'BH', 'BN', 'CF', 'GF', 'GT', 'HN', 'KW', 'LR', 'MR', 'OM', 'QA', 'SL', 'SM', 'ST', 'VG'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BUILDING B'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['66C27239DA0384FFDD662CB8A543DBAB', 'AD460A03C5F2CBDC967586AF7495D308'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['ADMIRALTY LAW', 'BIOLOGICAL BEHAVIOURAL SCIENCES', 'BREAST IMPLANTS', 'ETHICAL THEORIES', 'INERTIAL NAVIGATION SYSTEM', 'NUCLEAR CHEMISTRY', 'OTHER AGRICULTURAL SCIENCES', 'OTHER MEDICAL SCIENCES'] will be ignored\n",
" warnings.warn(\n",
"[I 2025-05-12 21:35:06,892] Trial 0 finished with value: 0.42635007849963164 and parameters: {'n_estimators': 151, 'max_depth': 5, 'learning_rate': 0.27984807723222593, 'scale_pos_weight': 6.072340577651307}. Best is trial 0 with value: 0.42635007849963164.\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['AW', 'GM', 'GN', 'GY', 'JE', 'MV'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['CARRETERA AL AJUSCO NUM. 377'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['FI', 'GA', 'NO', 'RO'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['COMPUTATIONAL TOPOLOGY', 'ELECTRICAL ENGINEERING, ELECTRONIC ENGINEERING, INFORMATION ENGINEERING', 'FLORICULTURE', 'FRUGAL ARTIFICIAL INTELLIGENCE', 'GENERAL MEDICINE', 'HISTORY OF PHILOSOPHY', 'INFORMATION ENGINEERING', 'ISLAMIC SCHOOLS', 'OTHER SOCIAL SCIENCES', 'SILICENE', 'ULTRAVIOLET LASERS'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BI', 'FJ', 'GD', 'GW', 'HT', 'LS', 'PG', 'SV', 'ZZ'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['3D4DDC224DA059CAA718E6F471585874', 'ET', 'HR', 'LT', 'MT', 'PT', 'SR'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['APPLIED MECHANICS', 'BASIC MEDICINE', 'EDGE ARTIFICIAL INTELLIGENCE', 'FIXED WIRELESS NETWORK', 'ICE GIANTS', 'ORTHODONTICS', 'PERIODONTICS', 'PLANT CLONING', 'REGIONAL HUMAN RIGHTS', 'STRABISMUS'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BB', 'BH', 'BN', 'CF', 'GF', 'GT', 'HN', 'KW', 'LR', 'MR', 'OM', 'QA', 'SL', 'SM', 'ST', 'VG'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BUILDING B'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['66C27239DA0384FFDD662CB8A543DBAB', 'AD460A03C5F2CBDC967586AF7495D308'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['ADMIRALTY LAW', 'BIOLOGICAL BEHAVIOURAL SCIENCES', 'BREAST IMPLANTS', 'ETHICAL THEORIES', 'INERTIAL NAVIGATION SYSTEM', 'NUCLEAR CHEMISTRY', 'OTHER AGRICULTURAL SCIENCES', 'OTHER MEDICAL SCIENCES'] will be ignored\n",
" warnings.warn(\n",
"[I 2025-05-12 21:42:41,795] Trial 1 finished with value: 0.4292899461188686 and parameters: {'n_estimators': 175, 'max_depth': 5, 'learning_rate': 0.18982366215962182, 'scale_pos_weight': 2.954471437724056}. Best is trial 1 with value: 0.4292899461188686.\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['AW', 'GM', 'GN', 'GY', 'JE', 'MV'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['CARRETERA AL AJUSCO NUM. 377'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['FI', 'GA', 'NO', 'RO'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['COMPUTATIONAL TOPOLOGY', 'ELECTRICAL ENGINEERING, ELECTRONIC ENGINEERING, INFORMATION ENGINEERING', 'FLORICULTURE', 'FRUGAL ARTIFICIAL INTELLIGENCE', 'GENERAL MEDICINE', 'HISTORY OF PHILOSOPHY', 'INFORMATION ENGINEERING', 'ISLAMIC SCHOOLS', 'OTHER SOCIAL SCIENCES', 'SILICENE', 'ULTRAVIOLET LASERS'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BI', 'FJ', 'GD', 'GW', 'HT', 'LS', 'PG', 'SV', 'ZZ'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['3D4DDC224DA059CAA718E6F471585874', 'ET', 'HR', 'LT', 'MT', 'PT', 'SR'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['APPLIED MECHANICS', 'BASIC MEDICINE', 'EDGE ARTIFICIAL INTELLIGENCE', 'FIXED WIRELESS NETWORK', 'ICE GIANTS', 'ORTHODONTICS', 'PERIODONTICS', 'PLANT CLONING', 'REGIONAL HUMAN RIGHTS', 'STRABISMUS'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BB', 'BH', 'BN', 'CF', 'GF', 'GT', 'HN', 'KW', 'LR', 'MR', 'OM', 'QA', 'SL', 'SM', 'ST', 'VG'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BUILDING B'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['66C27239DA0384FFDD662CB8A543DBAB', 'AD460A03C5F2CBDC967586AF7495D308'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['ADMIRALTY LAW', 'BIOLOGICAL BEHAVIOURAL SCIENCES', 'BREAST IMPLANTS', 'ETHICAL THEORIES', 'INERTIAL NAVIGATION SYSTEM', 'NUCLEAR CHEMISTRY', 'OTHER AGRICULTURAL SCIENCES', 'OTHER MEDICAL SCIENCES'] will be ignored\n",
" warnings.warn(\n",
"[I 2025-05-12 21:52:48,155] Trial 2 finished with value: 0.4289493329967354 and parameters: {'n_estimators': 156, 'max_depth': 10, 'learning_rate': 0.13243428465407345, 'scale_pos_weight': 3.250588524606016}. Best is trial 1 with value: 0.4292899461188686.\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['AW', 'GM', 'GN', 'GY', 'JE', 'MV'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['CARRETERA AL AJUSCO NUM. 377'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['FI', 'GA', 'NO', 'RO'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['COMPUTATIONAL TOPOLOGY', 'ELECTRICAL ENGINEERING, ELECTRONIC ENGINEERING, INFORMATION ENGINEERING', 'FLORICULTURE', 'FRUGAL ARTIFICIAL INTELLIGENCE', 'GENERAL MEDICINE', 'HISTORY OF PHILOSOPHY', 'INFORMATION ENGINEERING', 'ISLAMIC SCHOOLS', 'OTHER SOCIAL SCIENCES', 'SILICENE', 'ULTRAVIOLET LASERS'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['BI', 'FJ', 'GD', 'GW', 'HT', 'LS', 'PG', 'SV', 'ZZ'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['3D4DDC224DA059CAA718E6F471585874', 'ET', 'HR', 'LT', 'MT', 'PT', 'SR'] will be ignored\n",
" warnings.warn(\n",
"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:900: UserWarning: unknown class(es) ['APPLIED MECHANICS', 'BASIC MEDICINE', 'EDGE ARTIFICIAL INTELLIGENCE', 'FIXED WIRELESS NETWORK', 'ICE GIANTS', 'ORTHODONTICS', 'PERIODONTICS', 'PLANT CLONING', 'REGIONAL HUMAN RIGHTS', 'STRABISMUS'] will be ignored\n",
" warnings.warn(\n",
"[W 2025-05-12 22:00:45,953] Trial 3 failed with parameters: {'n_estimators': 117, 'max_depth': 9, 'learning_rate': 0.09612739086701036, 'scale_pos_weight': 9.26268599189327} because of the following error: KeyboardInterrupt().\n",
"Traceback (most recent call last):\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py\", line 197, in _run_trial\n",
" value_or_values = func(trial)\n",
" ^^^^^^^^^^^\n",
" File \"C:\\Users\\Romain\\AppData\\Local\\Temp\\ipykernel_43912\\262172098.py\", line 256, in objective\n",
" scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(3, shuffle=True, random_state=42),\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 213, in wrapper\n",
" return func(*args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 712, in cross_val_score\n",
" cv_results = cross_validate(\n",
" ^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py\", line 213, in wrapper\n",
" return func(*args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 423, in cross_validate\n",
" results = parallel(\n",
" ^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\parallel.py\", line 74, in __call__\n",
" return super().__call__(iterable_with_config)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\parallel.py\", line 1918, in __call__\n",
" return output if self.return_generator else list(output)\n",
" ^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\parallel.py\", line 1847, in _get_sequential_output\n",
" res = func(*args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\parallel.py\", line 136, in __call__\n",
" return self.function(*args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py\", line 888, in _fit_and_score\n",
" estimator.fit(X_train, y_train, **fit_params)\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\base.py\", line 1473, in wrapper\n",
" return fit_method(estimator, *args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py\", line 329, in fit\n",
" Xt, yt = self._fit(X, y, routed_params)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py\", line 265, in _fit\n",
" X, y, fitted_transformer = fit_resample_one_cached(\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\memory.py\", line 312, in __call__\n",
" return self.func(*args, **kwargs)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py\", line 1057, in _fit_resample_one\n",
" X_res, y_res = sampler.fit_resample(X, y, **params.get(\"fit_resample\", {}))\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py\", line 208, in fit_resample\n",
" return super().fit_resample(X, y)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py\", line 112, in fit_resample\n",
" output = self._fit_resample(X, y)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\combine\\_smote_enn.py\", line 161, in _fit_resample\n",
" return self.enn_.fit_resample(X_res, y_res)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py\", line 208, in fit_resample\n",
" return super().fit_resample(X, y)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py\", line 112, in fit_resample\n",
" output = self._fit_resample(X, y)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\under_sampling\\_prototype_selection\\_edited_nearest_neighbours.py\", line 168, in _fit_resample\n",
" nnhood_idx = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\neighbors\\_base.py\", line 849, in kneighbors\n",
" results = ArgKmin.compute(\n",
" ^^^^^^^^^^^^^^^^\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_pairwise_distances_reduction\\_dispatcher.py\", line 278, in compute\n",
" return ArgKmin64.compute(\n",
" ^^^^^^^^^^^^^^^^^^\n",
" File \"sklearn\\\\metrics\\\\_pairwise_distances_reduction\\\\_argkmin.pyx\", line 59, in sklearn.metrics._pairwise_distances_reduction._argkmin.ArgKmin64.compute\n",
" File \"c:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\threadpoolctl.py\", line 592, in __exit__\n",
" def __exit__(self, type, value, traceback):\n",
" \n",
"KeyboardInterrupt\n",
"[W 2025-05-12 22:00:46,241] Trial 3 failed with value None.\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[3], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m status_prediction_model(consolidated_clean\u001b[38;5;241m.\u001b[39mto_pandas())\n",
"Cell \u001b[1;32mIn[2], line 261\u001b[0m, in \u001b[0;36mstatus_prediction_model\u001b[1;34m(df)\u001b[0m\n\u001b[0;32m 258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m scores\u001b[38;5;241m.\u001b[39mmean()\n\u001b[0;32m 260\u001b[0m study \u001b[38;5;241m=\u001b[39m optuna\u001b[38;5;241m.\u001b[39mcreate_study(direction\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmaximize\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m--> 261\u001b[0m study\u001b[38;5;241m.\u001b[39moptimize(objective, n_trials\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m10\u001b[39m)\n\u001b[0;32m 262\u001b[0m best_params \u001b[38;5;241m=\u001b[39m study\u001b[38;5;241m.\u001b[39mbest_trial\u001b[38;5;241m.\u001b[39mparams\n\u001b[0;32m 263\u001b[0m base_model\u001b[38;5;241m.\u001b[39mset_params(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mbest_params)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\study.py:475\u001b[0m, in \u001b[0;36mStudy.optimize\u001b[1;34m(self, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001b[0m\n\u001b[0;32m 373\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21moptimize\u001b[39m(\n\u001b[0;32m 374\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 375\u001b[0m func: ObjectiveFuncType,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 382\u001b[0m show_progress_bar: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 383\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 384\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Optimize an objective function.\u001b[39;00m\n\u001b[0;32m 385\u001b[0m \n\u001b[0;32m 386\u001b[0m \u001b[38;5;124;03m Optimization is done by choosing a suitable set of hyperparameter values from a given\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 473\u001b[0m \u001b[38;5;124;03m If nested invocation of this method occurs.\u001b[39;00m\n\u001b[0;32m 474\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 475\u001b[0m _optimize(\n\u001b[0;32m 476\u001b[0m study\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 477\u001b[0m func\u001b[38;5;241m=\u001b[39mfunc,\n\u001b[0;32m 478\u001b[0m n_trials\u001b[38;5;241m=\u001b[39mn_trials,\n\u001b[0;32m 479\u001b[0m timeout\u001b[38;5;241m=\u001b[39mtimeout,\n\u001b[0;32m 480\u001b[0m n_jobs\u001b[38;5;241m=\u001b[39mn_jobs,\n\u001b[0;32m 481\u001b[0m catch\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mtuple\u001b[39m(catch) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(catch, Iterable) \u001b[38;5;28;01melse\u001b[39;00m (catch,),\n\u001b[0;32m 482\u001b[0m callbacks\u001b[38;5;241m=\u001b[39mcallbacks,\n\u001b[0;32m 483\u001b[0m 
gc_after_trial\u001b[38;5;241m=\u001b[39mgc_after_trial,\n\u001b[0;32m 484\u001b[0m show_progress_bar\u001b[38;5;241m=\u001b[39mshow_progress_bar,\n\u001b[0;32m 485\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py:63\u001b[0m, in \u001b[0;36m_optimize\u001b[1;34m(study, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001b[0m\n\u001b[0;32m 61\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 62\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_jobs \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m---> 63\u001b[0m _optimize_sequential(\n\u001b[0;32m 64\u001b[0m study,\n\u001b[0;32m 65\u001b[0m func,\n\u001b[0;32m 66\u001b[0m n_trials,\n\u001b[0;32m 67\u001b[0m timeout,\n\u001b[0;32m 68\u001b[0m catch,\n\u001b[0;32m 69\u001b[0m callbacks,\n\u001b[0;32m 70\u001b[0m gc_after_trial,\n\u001b[0;32m 71\u001b[0m reseed_sampler_rng\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 72\u001b[0m time_start\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 73\u001b[0m progress_bar\u001b[38;5;241m=\u001b[39mprogress_bar,\n\u001b[0;32m 74\u001b[0m )\n\u001b[0;32m 75\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 76\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_jobs \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m:\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py:160\u001b[0m, in \u001b[0;36m_optimize_sequential\u001b[1;34m(study, func, n_trials, timeout, catch, callbacks, gc_after_trial, reseed_sampler_rng, time_start, progress_bar)\u001b[0m\n\u001b[0;32m 157\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[0;32m 159\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 160\u001b[0m frozen_trial \u001b[38;5;241m=\u001b[39m _run_trial(study, func, catch)\n\u001b[0;32m 161\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 162\u001b[0m \u001b[38;5;66;03m# The following line mitigates memory problems that can be occurred in some\u001b[39;00m\n\u001b[0;32m 163\u001b[0m \u001b[38;5;66;03m# environments (e.g., services that use computing containers such as GitHub Actions).\u001b[39;00m\n\u001b[0;32m 164\u001b[0m \u001b[38;5;66;03m# Please refer to the following PR for further details:\u001b[39;00m\n\u001b[0;32m 165\u001b[0m \u001b[38;5;66;03m# https://github.com/optuna/optuna/pull/325.\u001b[39;00m\n\u001b[0;32m 166\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gc_after_trial:\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py:248\u001b[0m, in \u001b[0;36m_run_trial\u001b[1;34m(study, func, catch)\u001b[0m\n\u001b[0;32m 241\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mShould not reach.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 243\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m 244\u001b[0m frozen_trial\u001b[38;5;241m.\u001b[39mstate \u001b[38;5;241m==\u001b[39m TrialState\u001b[38;5;241m.\u001b[39mFAIL\n\u001b[0;32m 245\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m func_err \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 246\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(func_err, catch)\n\u001b[0;32m 247\u001b[0m ):\n\u001b[1;32m--> 248\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m func_err\n\u001b[0;32m 249\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m frozen_trial\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\optuna\\study\\_optimize.py:197\u001b[0m, in \u001b[0;36m_run_trial\u001b[1;34m(study, func, catch)\u001b[0m\n\u001b[0;32m 195\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m get_heartbeat_thread(trial\u001b[38;5;241m.\u001b[39m_trial_id, study\u001b[38;5;241m.\u001b[39m_storage):\n\u001b[0;32m 196\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 197\u001b[0m value_or_values \u001b[38;5;241m=\u001b[39m func(trial)\n\u001b[0;32m 198\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mTrialPruned \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 199\u001b[0m \u001b[38;5;66;03m# TODO(mamu): Handle multi-objective cases.\u001b[39;00m\n\u001b[0;32m 200\u001b[0m state \u001b[38;5;241m=\u001b[39m TrialState\u001b[38;5;241m.\u001b[39mPRUNED\n",
"Cell \u001b[1;32mIn[2], line 256\u001b[0m, in \u001b[0;36mstatus_prediction_model..objective\u001b[1;34m(trial)\u001b[0m\n\u001b[0;32m 254\u001b[0m base_model\u001b[38;5;241m.\u001b[39mset_params(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams)\n\u001b[0;32m 255\u001b[0m pipeline \u001b[38;5;241m=\u001b[39m build_pipeline(preprocessor, base_model)\n\u001b[1;32m--> 256\u001b[0m scores \u001b[38;5;241m=\u001b[39m cross_val_score(pipeline, X, y, cv\u001b[38;5;241m=\u001b[39mStratifiedKFold(\u001b[38;5;241m3\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m42\u001b[39m),\n\u001b[0;32m 257\u001b[0m scoring\u001b[38;5;241m=\u001b[39mmake_scorer(f1_score, pos_label\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m))\n\u001b[0;32m 258\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m scores\u001b[38;5;241m.\u001b[39mmean()\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must 
be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:712\u001b[0m, in \u001b[0;36mcross_val_score\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, error_score)\u001b[0m\n\u001b[0;32m 709\u001b[0m \u001b[38;5;66;03m# To ensure multimetric format is not supported\u001b[39;00m\n\u001b[0;32m 710\u001b[0m scorer \u001b[38;5;241m=\u001b[39m check_scoring(estimator, scoring\u001b[38;5;241m=\u001b[39mscoring)\n\u001b[1;32m--> 712\u001b[0m cv_results \u001b[38;5;241m=\u001b[39m cross_validate(\n\u001b[0;32m 713\u001b[0m estimator\u001b[38;5;241m=\u001b[39mestimator,\n\u001b[0;32m 714\u001b[0m X\u001b[38;5;241m=\u001b[39mX,\n\u001b[0;32m 715\u001b[0m y\u001b[38;5;241m=\u001b[39my,\n\u001b[0;32m 716\u001b[0m groups\u001b[38;5;241m=\u001b[39mgroups,\n\u001b[0;32m 717\u001b[0m scoring\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mscore\u001b[39m\u001b[38;5;124m\"\u001b[39m: scorer},\n\u001b[0;32m 718\u001b[0m cv\u001b[38;5;241m=\u001b[39mcv,\n\u001b[0;32m 719\u001b[0m n_jobs\u001b[38;5;241m=\u001b[39mn_jobs,\n\u001b[0;32m 720\u001b[0m verbose\u001b[38;5;241m=\u001b[39mverbose,\n\u001b[0;32m 721\u001b[0m fit_params\u001b[38;5;241m=\u001b[39mfit_params,\n\u001b[0;32m 722\u001b[0m params\u001b[38;5;241m=\u001b[39mparams,\n\u001b[0;32m 723\u001b[0m pre_dispatch\u001b[38;5;241m=\u001b[39mpre_dispatch,\n\u001b[0;32m 724\u001b[0m error_score\u001b[38;5;241m=\u001b[39merror_score,\n\u001b[0;32m 725\u001b[0m )\n\u001b[0;32m 726\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m cv_results[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtest_score\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:213\u001b[0m, in \u001b[0;36mvalidate_params..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 208\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 209\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 210\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 211\u001b[0m )\n\u001b[0;32m 212\u001b[0m ):\n\u001b[1;32m--> 213\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m InvalidParameterError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 215\u001b[0m \u001b[38;5;66;03m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[0;32m 216\u001b[0m \u001b[38;5;66;03m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[0;32m 217\u001b[0m \u001b[38;5;66;03m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[0;32m 218\u001b[0m \u001b[38;5;66;03m# message to avoid confusion.\u001b[39;00m\n\u001b[0;32m 219\u001b[0m msg \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39msub(\n\u001b[0;32m 220\u001b[0m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+ must be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 221\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mparameter of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__qualname__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must 
be\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 222\u001b[0m \u001b[38;5;28mstr\u001b[39m(e),\n\u001b[0;32m 223\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:423\u001b[0m, in \u001b[0;36mcross_validate\u001b[1;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)\u001b[0m\n\u001b[0;32m 420\u001b[0m \u001b[38;5;66;03m# We clone the estimator to make sure that all the folds are\u001b[39;00m\n\u001b[0;32m 421\u001b[0m \u001b[38;5;66;03m# independent, and that it is pickle-able.\u001b[39;00m\n\u001b[0;32m 422\u001b[0m parallel \u001b[38;5;241m=\u001b[39m Parallel(n_jobs\u001b[38;5;241m=\u001b[39mn_jobs, verbose\u001b[38;5;241m=\u001b[39mverbose, pre_dispatch\u001b[38;5;241m=\u001b[39mpre_dispatch)\n\u001b[1;32m--> 423\u001b[0m results \u001b[38;5;241m=\u001b[39m parallel(\n\u001b[0;32m 424\u001b[0m delayed(_fit_and_score)(\n\u001b[0;32m 425\u001b[0m clone(estimator),\n\u001b[0;32m 426\u001b[0m X,\n\u001b[0;32m 427\u001b[0m y,\n\u001b[0;32m 428\u001b[0m scorer\u001b[38;5;241m=\u001b[39mscorers,\n\u001b[0;32m 429\u001b[0m train\u001b[38;5;241m=\u001b[39mtrain,\n\u001b[0;32m 430\u001b[0m test\u001b[38;5;241m=\u001b[39mtest,\n\u001b[0;32m 431\u001b[0m verbose\u001b[38;5;241m=\u001b[39mverbose,\n\u001b[0;32m 432\u001b[0m parameters\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 433\u001b[0m fit_params\u001b[38;5;241m=\u001b[39mrouted_params\u001b[38;5;241m.\u001b[39mestimator\u001b[38;5;241m.\u001b[39mfit,\n\u001b[0;32m 434\u001b[0m score_params\u001b[38;5;241m=\u001b[39mrouted_params\u001b[38;5;241m.\u001b[39mscorer\u001b[38;5;241m.\u001b[39mscore,\n\u001b[0;32m 435\u001b[0m return_train_score\u001b[38;5;241m=\u001b[39mreturn_train_score,\n\u001b[0;32m 436\u001b[0m return_times\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[0;32m 437\u001b[0m return_estimator\u001b[38;5;241m=\u001b[39mreturn_estimator,\n\u001b[0;32m 438\u001b[0m 
error_score\u001b[38;5;241m=\u001b[39merror_score,\n\u001b[0;32m 439\u001b[0m )\n\u001b[0;32m 440\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m train, test \u001b[38;5;129;01min\u001b[39;00m indices\n\u001b[0;32m 441\u001b[0m )\n\u001b[0;32m 443\u001b[0m _warn_or_raise_about_fit_failures(results, error_score)\n\u001b[0;32m 445\u001b[0m \u001b[38;5;66;03m# For callable scoring, the return type is only know after calling. If the\u001b[39;00m\n\u001b[0;32m 446\u001b[0m \u001b[38;5;66;03m# return type is a dictionary, the error scores can now be inserted with\u001b[39;00m\n\u001b[0;32m 447\u001b[0m \u001b[38;5;66;03m# the correct key.\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\parallel.py:74\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 69\u001b[0m config \u001b[38;5;241m=\u001b[39m get_config()\n\u001b[0;32m 70\u001b[0m iterable_with_config \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 71\u001b[0m (_with_config(delayed_func, config), args, kwargs)\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m delayed_func, args, kwargs \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[0;32m 73\u001b[0m )\n\u001b[1;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m(iterable_with_config)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\parallel.py:1918\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1916\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m 1917\u001b[0m \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 1918\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mlist\u001b[39m(output)\n\u001b[0;32m 1920\u001b[0m \u001b[38;5;66;03m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m 1921\u001b[0m \u001b[38;5;66;03m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m 1922\u001b[0m \u001b[38;5;66;03m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m 1923\u001b[0m \u001b[38;5;66;03m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m 1924\u001b[0m \u001b[38;5;66;03m# callback.\u001b[39;00m\n\u001b[0;32m 1925\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\parallel.py:1847\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m 1845\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_batches \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1847\u001b[0m res \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 1848\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_completed_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 1849\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_progress()\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\utils\\parallel.py:136\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 134\u001b[0m config \u001b[38;5;241m=\u001b[39m {}\n\u001b[0;32m 135\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mconfig):\n\u001b[1;32m--> 136\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunction(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py:888\u001b[0m, in \u001b[0;36m_fit_and_score\u001b[1;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, score_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)\u001b[0m\n\u001b[0;32m 886\u001b[0m estimator\u001b[38;5;241m.\u001b[39mfit(X_train, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\n\u001b[0;32m 887\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 888\u001b[0m estimator\u001b[38;5;241m.\u001b[39mfit(X_train, y_train, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\n\u001b[0;32m 890\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[0;32m 891\u001b[0m \u001b[38;5;66;03m# Note fit time as time until error\u001b[39;00m\n\u001b[0;32m 892\u001b[0m fit_time \u001b[38;5;241m=\u001b[39m time\u001b[38;5;241m.\u001b[39mtime() \u001b[38;5;241m-\u001b[39m start_time\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[1;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[0;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[0;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[0;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[0;32m 1471\u001b[0m )\n\u001b[0;32m 1472\u001b[0m ):\n\u001b[1;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fit_method(estimator, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py:329\u001b[0m, in \u001b[0;36mPipeline.fit\u001b[1;34m(self, X, y, **params)\u001b[0m\n\u001b[0;32m 285\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Fit the model.\u001b[39;00m\n\u001b[0;32m 286\u001b[0m \n\u001b[0;32m 287\u001b[0m \u001b[38;5;124;03mFit all the transforms/samplers one after the other and\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 326\u001b[0m \u001b[38;5;124;03m This estimator.\u001b[39;00m\n\u001b[0;32m 327\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 328\u001b[0m routed_params \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_method_params(method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit\u001b[39m\u001b[38;5;124m\"\u001b[39m, props\u001b[38;5;241m=\u001b[39mparams)\n\u001b[1;32m--> 329\u001b[0m Xt, yt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit(X, y, routed_params)\n\u001b[0;32m 330\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps) \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m)):\n\u001b[0;32m 331\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_final_estimator \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpassthrough\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py:265\u001b[0m, in \u001b[0;36mPipeline._fit\u001b[1;34m(self, X, y, routed_params)\u001b[0m\n\u001b[0;32m 255\u001b[0m X, fitted_transformer \u001b[38;5;241m=\u001b[39m fit_transform_one_cached(\n\u001b[0;32m 256\u001b[0m cloned_transformer,\n\u001b[0;32m 257\u001b[0m X,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 262\u001b[0m params\u001b[38;5;241m=\u001b[39mrouted_params[name],\n\u001b[0;32m 263\u001b[0m )\n\u001b[0;32m 264\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(cloned_transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_resample\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m--> 265\u001b[0m X, y, fitted_transformer \u001b[38;5;241m=\u001b[39m fit_resample_one_cached(\n\u001b[0;32m 266\u001b[0m cloned_transformer,\n\u001b[0;32m 267\u001b[0m X,\n\u001b[0;32m 268\u001b[0m y,\n\u001b[0;32m 269\u001b[0m message_clsname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPipeline\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 270\u001b[0m message\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_log_message(step_idx),\n\u001b[0;32m 271\u001b[0m params\u001b[38;5;241m=\u001b[39mrouted_params[name],\n\u001b[0;32m 272\u001b[0m )\n\u001b[0;32m 273\u001b[0m \u001b[38;5;66;03m# Replace the transformer of the step with the fitted\u001b[39;00m\n\u001b[0;32m 274\u001b[0m \u001b[38;5;66;03m# transformer. This is necessary when loading the transformer\u001b[39;00m\n\u001b[0;32m 275\u001b[0m \u001b[38;5;66;03m# from the cache.\u001b[39;00m\n\u001b[0;32m 276\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[step_idx] \u001b[38;5;241m=\u001b[39m (name, fitted_transformer)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\joblib\\memory.py:312\u001b[0m, in \u001b[0;36mNotMemorizedFunc.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 311\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m--> 312\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunc(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\pipeline.py:1057\u001b[0m, in \u001b[0;36m_fit_resample_one\u001b[1;34m(sampler, X, y, message_clsname, message, params)\u001b[0m\n\u001b[0;32m 1055\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_fit_resample_one\u001b[39m(sampler, X, y, message_clsname\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m, message\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, params\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n\u001b[0;32m 1056\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[1;32m-> 1057\u001b[0m X_res, y_res \u001b[38;5;241m=\u001b[39m sampler\u001b[38;5;241m.\u001b[39mfit_resample(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparams\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_resample\u001b[39m\u001b[38;5;124m\"\u001b[39m, {}))\n\u001b[0;32m 1059\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m X_res, y_res, sampler\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py:208\u001b[0m, in \u001b[0;36mBaseSampler.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Resample the dataset.\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124;03m The corresponding label of `X_resampled`.\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m--> 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mfit_resample(X, y)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py:112\u001b[0m, in \u001b[0;36mSamplerMixin.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 106\u001b[0m X, y, binarize_y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_X_y(X, y)\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy, y, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampling_type\n\u001b[0;32m 110\u001b[0m )\n\u001b[1;32m--> 112\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_resample(X, y)\n\u001b[0;32m 114\u001b[0m y_ \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 115\u001b[0m label_binarize(output[\u001b[38;5;241m1\u001b[39m], classes\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39munique(y)) \u001b[38;5;28;01mif\u001b[39;00m binarize_y \u001b[38;5;28;01melse\u001b[39;00m output[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 116\u001b[0m )\n\u001b[0;32m 118\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m arrays_transformer\u001b[38;5;241m.\u001b[39mtransform(output[\u001b[38;5;241m0\u001b[39m], y_)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\combine\\_smote_enn.py:161\u001b[0m, in \u001b[0;36mSMOTEENN._fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 158\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy\n\u001b[0;32m 160\u001b[0m X_res, y_res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msmote_\u001b[38;5;241m.\u001b[39mfit_resample(X, y)\n\u001b[1;32m--> 161\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39menn_\u001b[38;5;241m.\u001b[39mfit_resample(X_res, y_res)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py:208\u001b[0m, in \u001b[0;36mBaseSampler.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Resample the dataset.\u001b[39;00m\n\u001b[0;32m 188\u001b[0m \n\u001b[0;32m 189\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[38;5;124;03m The corresponding label of `X_resampled`.\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m--> 208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mfit_resample(X, y)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\base.py:112\u001b[0m, in \u001b[0;36mSamplerMixin.fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 106\u001b[0m X, y, binarize_y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_X_y(X, y)\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy_ \u001b[38;5;241m=\u001b[39m check_sampling_strategy(\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msampling_strategy, y, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sampling_type\n\u001b[0;32m 110\u001b[0m )\n\u001b[1;32m--> 112\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_resample(X, y)\n\u001b[0;32m 114\u001b[0m y_ \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 115\u001b[0m label_binarize(output[\u001b[38;5;241m1\u001b[39m], classes\u001b[38;5;241m=\u001b[39mnp\u001b[38;5;241m.\u001b[39munique(y)) \u001b[38;5;28;01mif\u001b[39;00m binarize_y \u001b[38;5;28;01melse\u001b[39;00m output[\u001b[38;5;241m1\u001b[39m]\n\u001b[0;32m 116\u001b[0m )\n\u001b[0;32m 118\u001b[0m X_, y_ \u001b[38;5;241m=\u001b[39m arrays_transformer\u001b[38;5;241m.\u001b[39mtransform(output[\u001b[38;5;241m0\u001b[39m], y_)\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\imblearn\\under_sampling\\_prototype_selection\\_edited_nearest_neighbours.py:168\u001b[0m, in \u001b[0;36mEditedNearestNeighbours._fit_resample\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m 166\u001b[0m X_class \u001b[38;5;241m=\u001b[39m _safe_indexing(X, target_class_indices)\n\u001b[0;32m 167\u001b[0m y_class \u001b[38;5;241m=\u001b[39m _safe_indexing(y, target_class_indices)\n\u001b[1;32m--> 168\u001b[0m nnhood_idx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnn_\u001b[38;5;241m.\u001b[39mkneighbors(X_class, return_distance\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)[:, \u001b[38;5;241m1\u001b[39m:]\n\u001b[0;32m 169\u001b[0m nnhood_label \u001b[38;5;241m=\u001b[39m y[nnhood_idx]\n\u001b[0;32m 170\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkind_sel \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmode\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\neighbors\\_base.py:849\u001b[0m, in \u001b[0;36mKNeighborsMixin.kneighbors\u001b[1;34m(self, X, n_neighbors, return_distance)\u001b[0m\n\u001b[0;32m 842\u001b[0m use_pairwise_distances_reductions \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 843\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbrute\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 844\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m ArgKmin\u001b[38;5;241m.\u001b[39mis_usable_for(\n\u001b[0;32m 845\u001b[0m X \u001b[38;5;28;01mif\u001b[39;00m X \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_X, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_X, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39meffective_metric_\n\u001b[0;32m 846\u001b[0m )\n\u001b[0;32m 847\u001b[0m )\n\u001b[0;32m 848\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m use_pairwise_distances_reductions:\n\u001b[1;32m--> 849\u001b[0m results \u001b[38;5;241m=\u001b[39m ArgKmin\u001b[38;5;241m.\u001b[39mcompute(\n\u001b[0;32m 850\u001b[0m X\u001b[38;5;241m=\u001b[39mX,\n\u001b[0;32m 851\u001b[0m Y\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_X,\n\u001b[0;32m 852\u001b[0m k\u001b[38;5;241m=\u001b[39mn_neighbors,\n\u001b[0;32m 853\u001b[0m metric\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39meffective_metric_,\n\u001b[0;32m 854\u001b[0m metric_kwargs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39meffective_metric_params_,\n\u001b[0;32m 855\u001b[0m strategy\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 
856\u001b[0m return_distance\u001b[38;5;241m=\u001b[39mreturn_distance,\n\u001b[0;32m 857\u001b[0m )\n\u001b[0;32m 859\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m (\n\u001b[0;32m 860\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fit_method \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbrute\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmetric \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprecomputed\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m issparse(X)\n\u001b[0;32m 861\u001b[0m ):\n\u001b[0;32m 862\u001b[0m results \u001b[38;5;241m=\u001b[39m _kneighbors_from_graph(\n\u001b[0;32m 863\u001b[0m X, n_neighbors\u001b[38;5;241m=\u001b[39mn_neighbors, return_distance\u001b[38;5;241m=\u001b[39mreturn_distance\n\u001b[0;32m 864\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\sklearn\\metrics\\_pairwise_distances_reduction\\_dispatcher.py:278\u001b[0m, in \u001b[0;36mArgKmin.compute\u001b[1;34m(cls, X, Y, k, metric, chunk_size, metric_kwargs, strategy, return_distance)\u001b[0m\n\u001b[0;32m 197\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Compute the argkmin reduction.\u001b[39;00m\n\u001b[0;32m 198\u001b[0m \n\u001b[0;32m 199\u001b[0m \u001b[38;5;124;03mParameters\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 275\u001b[0m \u001b[38;5;124;03mreturns.\u001b[39;00m\n\u001b[0;32m 276\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 277\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m X\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m Y\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m np\u001b[38;5;241m.\u001b[39mfloat64:\n\u001b[1;32m--> 278\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ArgKmin64\u001b[38;5;241m.\u001b[39mcompute(\n\u001b[0;32m 279\u001b[0m X\u001b[38;5;241m=\u001b[39mX,\n\u001b[0;32m 280\u001b[0m Y\u001b[38;5;241m=\u001b[39mY,\n\u001b[0;32m 281\u001b[0m k\u001b[38;5;241m=\u001b[39mk,\n\u001b[0;32m 282\u001b[0m metric\u001b[38;5;241m=\u001b[39mmetric,\n\u001b[0;32m 283\u001b[0m chunk_size\u001b[38;5;241m=\u001b[39mchunk_size,\n\u001b[0;32m 284\u001b[0m metric_kwargs\u001b[38;5;241m=\u001b[39mmetric_kwargs,\n\u001b[0;32m 285\u001b[0m strategy\u001b[38;5;241m=\u001b[39mstrategy,\n\u001b[0;32m 286\u001b[0m return_distance\u001b[38;5;241m=\u001b[39mreturn_distance,\n\u001b[0;32m 287\u001b[0m )\n\u001b[0;32m 289\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m X\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m Y\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m np\u001b[38;5;241m.\u001b[39mfloat32:\n\u001b[0;32m 290\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ArgKmin32\u001b[38;5;241m.\u001b[39mcompute(\n\u001b[0;32m 291\u001b[0m X\u001b[38;5;241m=\u001b[39mX,\n\u001b[0;32m 
292\u001b[0m Y\u001b[38;5;241m=\u001b[39mY,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 298\u001b[0m return_distance\u001b[38;5;241m=\u001b[39mreturn_distance,\n\u001b[0;32m 299\u001b[0m )\n",
"File \u001b[1;32msklearn\\\\metrics\\\\_pairwise_distances_reduction\\\\_argkmin.pyx:59\u001b[0m, in \u001b[0;36msklearn.metrics._pairwise_distances_reduction._argkmin.ArgKmin64.compute\u001b[1;34m()\u001b[0m\n",
"File \u001b[1;32mc:\\Users\\Romain\\anaconda3\\Lib\\site-packages\\threadpoolctl.py:592\u001b[0m, in \u001b[0;36m_ThreadpoolLimiter.__exit__\u001b[1;34m(self, type, value, traceback)\u001b[0m\n\u001b[0;32m 589\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__enter__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m 590\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n\u001b[1;32m--> 592\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28mtype\u001b[39m, value, traceback):\n\u001b[0;32m 593\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrestore_original_limits()\n\u001b[0;32m 595\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[0;32m 596\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrap\u001b[39m(\u001b[38;5;28mcls\u001b[39m, controller, \u001b[38;5;241m*\u001b[39m, limits\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, user_api\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m):\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mFailed to interrupt the Kernel. \n",
"\u001b[1;31mUnable to start Kernel 'base (Python 3.12.7)' due to a timeout waiting for the ports to get used. \n",
"\u001b[1;31mView Jupyter log for further details."
]
}
],
"source": [
"status_prediction_model(consolidated_clean.to_pandas())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import joblib\n",
"import numpy as np\n",
"import pandas as pd\n",
"import shap\n",
"import matplotlib.pyplot as plt\n",
"import scipy.sparse\n",
"\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.pipeline import Pipeline as SKPipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold\n",
"from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold\n",
"from sklearn.metrics import classification_report, ConfusionMatrixDisplay, f1_score, make_scorer\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.calibration import CalibratedClassifierCV\n",
"from sklearn.ensemble import IsolationForest\n",
"\n",
"from imblearn.pipeline import Pipeline as ImbPipeline\n",
"from imblearn.combine import SMOTEENN\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"from xgboost import XGBClassifier\n",
"\n",
"from evidently import Report\n",
"from evidently.presets import DataDriftPreset\n",
"\n",
"import optuna\n",
"\n",
"\n",
"# --- Custom Transformers ---\n",
"class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):\n",
" def fit(self, X,y=None):\n",
" self.col = X.name\n",
" self.mlb = MultiLabelBinarizer()\n",
" self.mlb.fit(X)\n",
" return self\n",
" def transform(self, X):\n",
" return self.mlb.transform(X)\n",
" def get_feature_names_out(self, input_features=None):\n",
" return [f\"{self.col}_{cls}\" for cls in self.mlb.classes_]\n",
" def get_params(self, deep=True):\n",
" return {}\n",
" def set_params(self, **params):\n",
" return self\n",
"\n",
"class AnomalyScoreTransformer(BaseEstimator, TransformerMixin):\n",
" def __init__(self):\n",
" self.model = IsolationForest(n_estimators=200, contamination=0.1, random_state=42)\n",
"\n",
" def fit(self, X, y=None):\n",
" self.model.fit(X)\n",
" return self\n",
"\n",
" def transform(self, X):\n",
" scores = -self.model.decision_function(X)\n",
" return np.hstack([X, scores.reshape(-1, 1)])\n",
"\n",
"# --- Step 1: Data Preparation ---\n",
"def prepare_data(df, is_train=True, model_dir=\"model_artifacts\"):\n",
" df = df.copy()\n",
" \n",
" if is_train:\n",
" df['status'] = df['status'].astype(str).str.upper()\n",
" df = df[df['status'].isin(['CLOSED', 'TERMINATED'])]\n",
" df['label'] = df['status'].map({'CLOSED': 0, 'TERMINATED': 1})\n",
" assert df['label'].notna().all(), \"Label column still has NaNs!\"\n",
"\n",
" multilabel_fields = [\n",
" 'list_country', 'list_activityType', 'list_deliverableType',\n",
" 'list_availableLanguages', 'list_euroSciVocTitle'\n",
" ]\n",
"\n",
" def extract_intermediate_levels(paths):\n",
" tokens = []\n",
" if isinstance(paths, list):\n",
" for p in paths:\n",
" parts = p.strip('/').split('/')\n",
" tokens.extend(parts[:-1])\n",
" return list(set(tokens))\n",
" df['euroSciVoc_intermediate'] = df['list_euroSciVocPath'].apply(extract_intermediate_levels)\n",
" multilabel_fields.append('euroSciVoc_intermediate')\n",
" \n",
" for col in multilabel_fields:\n",
" df[col] = df[col].apply(lambda x: [] if x is None else (x.tolist() if hasattr(x, 'tolist') else x))\n",
" df[col] = df[col].apply(lambda x: list(x) if not isinstance(x, list) else x)\n",
" df[col] = df[col].apply(lambda x: [item for item in x if item is not None])\n",
" df[col] = df[col].apply(lambda x: [str(item).upper() for item in x])\n",
"\n",
" \n",
" def split_languages(lang_list):\n",
" if not isinstance(lang_list, list):\n",
" return []\n",
" result = []\n",
" for entry in lang_list:\n",
" if isinstance(entry, str):\n",
" result.extend(entry.split(\",\"))\n",
" return result\n",
"\n",
" df[\"list_availableLanguages\"] = df[\"list_availableLanguages\"].apply(split_languages)\n",
" \n",
" df['topic_title'] = df['list_title_topic'].apply(\n",
" lambda x: (x.tolist() if hasattr(x, 'tolist') else x)[0] if x is not None and len(x) > 0 else \"unknown_topic\"\n",
" )\n",
"\n",
" for col in ['title', 'objective', 'topic_title']:\n",
" df[col] = df[col].fillna(\"\").astype(str)\n",
"\n",
" df['n_partners'] = df['list_name'].apply(\n",
" lambda x: len(x.tolist()) if x is not None and hasattr(x, 'tolist') else (len(x) if isinstance(x, list) else 0)\n",
" )\n",
"\n",
" df['n_country'] = df['list_country'].apply(\n",
" lambda x: len(x.tolist()) if x is not None and hasattr(x, 'tolist') else (len(x) if isinstance(x, list) else 0)\n",
" )\n",
"\n",
" df['n_sme'] = df['list_SME'].apply(\n",
" lambda x: sum(1 for i in (x.tolist() if hasattr(x, 'tolist') else x) if i is True)\n",
" if x is not None and (hasattr(x, 'tolist') or isinstance(x, list)) else 0\n",
" )\n",
"\n",
" return df\n",
"\n",
"# --- Step 2: Text Embedding ---\n",
"def compute_embeddings(df, text_columns, model_name='paraphrase-multilingual-MiniLM-L12-v2', svd_dim=50):\n",
" model = SentenceTransformer(model_name)\n",
" os.makedirs(\"model_artifacts\", exist_ok=True)\n",
" os.makedirs(\"embeddings\", exist_ok=True)\n",
" for col in text_columns:\n",
" embedding_file = f\"embeddings/{col}_embeddings.npy\"\n",
" svd_file = f\"model_artifacts/{col}_svd.pkl\"\n",
" if os.path.exists(embedding_file):\n",
" print(f\"Loading saved embeddings for column '{col}'...\")\n",
" embeddings = np.load(embedding_file)\n",
" else:\n",
" print(f\"Computing embeddings for column '{col}'...\")\n",
" embeddings = model.encode(df[col].tolist(), show_progress_bar=True)\n",
" np.save(embedding_file, embeddings)\n",
"\n",
" print(f\"Fitting SVD for column '{col}'...\")\n",
" svd = TruncatedSVD(n_components=svd_dim, random_state=42)\n",
" svd.fit(embeddings)\n",
" joblib.dump(svd, svd_file)\n",
"\n",
" reduced = svd.transform(embeddings)\n",
" embed_df = pd.DataFrame(reduced, columns=[f'{col}_embed_{i}' for i in range(reduced.shape[1])])\n",
" embed_df.index = df.index # Force matching index\n",
" df = pd.concat([df, embed_df], axis=1)\n",
" return df\n",
"\n",
"\n",
"# --- Step 3: Build Preprocessor ---\n",
"def build_preprocessor(numeric_features, categorical_features, multilabel_fields):\n",
" numeric_pipeline = SKPipeline([\n",
" ('imputer', SimpleImputer(strategy='median')),\n",
" ('anomaly', AnomalyScoreTransformer()),\n",
" ('scaler', StandardScaler())\n",
" ])\n",
"\n",
" categorical_pipeline = SKPipeline([\n",
" ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n",
" ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n",
" ])\n",
"\n",
" transformers = [\n",
" ('num', numeric_pipeline, numeric_features),\n",
" ('cat', categorical_pipeline, categorical_features),\n",
" *[(f'mlb_{col}', MultiLabelBinarizerTransformer(), col) for col in multilabel_fields]]\n",
" \n",
"\n",
" return ColumnTransformer(transformers, sparse_threshold=0.0)\n",
"\n",
"# --- Step 4: Build Pipeline ---\n",
"def build_pipeline(preprocessor, base_model, k=250):\n",
" return ImbPipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('anomaly', AnomalyScoreTransformer()),\n",
" ('resample', SMOTEENN()),\n",
" (\"variance_filter\", VarianceThreshold(threshold=0.0)),\n",
" ('feature_select', SelectKBest(score_func=f_classif, k=k)),\n",
" ('classifier', CalibratedClassifierCV(estimator=base_model, method='isotonic', cv=3))\n",
" ])\n",
"\n",
"# --- Step 5: Drift Monitoring ---\n",
"def monitor_drift(reference, current, feature_names, output_html='drift_report.html'):\n",
" ref_df = pd.DataFrame(reference, columns=feature_names)\n",
" cur_df = pd.DataFrame(current, columns=feature_names)\n",
" \n",
" report = Report(metrics=[DataDriftPreset()])\n",
" report.run(reference_data=ref_df, current_data=cur_df)\n",
" report.save_html(output_html)\n",
" print(f\"✅ Drift report saved to {output_html}\")\n",
"\n",
"\n",
"# --- Step 6: Evaluation + SHAP ---\n",
"def evaluate_model(model, X_train, X_test, y_train, y_test, feature_names):\n",
" model.fit(X_train, y_train)\n",
" y_pred = model.predict(X_test)\n",
" print(classification_report(y_test, y_pred))\n",
" ConfusionMatrixDisplay.from_predictions(y_test, y_pred)\n",
" plt.title(\"Evaluation\")\n",
" plt.tight_layout()\n",
" plt.show()\n",
"\n",
" X_proc = model.named_steps['preprocessor'].transform(X_test)\n",
" if scipy.sparse.issparse(X_proc):\n",
" X_proc = X_proc.toarray()\n",
"\n",
" selector = model.named_steps['feature_select']\n",
" X_selected = selector.transform(X_proc)\n",
"\n",
" explainer = shap.Explainer(model.named_steps['classifier'].base_estimator, feature_names=feature_names)\n",
" shap_values = explainer(X_selected)\n",
" shap.summary_plot(shap_values, X_selected)\n",
"\n",
"# --- Final Orchestration ---\n",
"def status_prediction_model(df):\n",
" os.makedirs(\"model_artifacts\", exist_ok=True)\n",
" print(\"🧹 Preparing data...\")\n",
" df = prepare_data(df, is_train=True)\n",
" print(\"💡 Embedding text...\")\n",
" df = compute_embeddings(df, ['title', 'objective', 'topic_title'])\n",
"\n",
" text_embed_cols = [col for col in df.columns if '_embed_' in col]\n",
" numeric_features = ['durationDays', 'startYear', 'ecMaxContribution', 'totalCost',\n",
" 'n_partners', 'n_country', 'n_sme'] + text_embed_cols\n",
" categorical_features = ['frameworkProgramme', 'fundingScheme', 'legalBasis', 'nature']\n",
" multilabel_fields = ['list_country', 'list_activityType', 'list_deliverableType',\n",
" 'list_availableLanguages', 'list_euroSciVocTitle','euroSciVoc_intermediate']\n",
" \n",
" \n",
" df = df[numeric_features + categorical_features + multilabel_fields + ['label']]\n",
" X = df.drop(columns='label')\n",
" y = df['label']\n",
"\n",
"\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)\n",
"\n",
" print(\"🧱 Building pipeline...\")\n",
" preprocessor = build_preprocessor(numeric_features, categorical_features, multilabel_fields)\n",
" base_model = XGBClassifier(eval_metric='logloss', n_jobs=-1)\n",
"\n",
" print(\"🎯 Training model with Optuna...\")\n",
" def objective(trial):\n",
" params = {\n",
" 'n_estimators': trial.suggest_int('n_estimators', 100, 300),\n",
" 'max_depth': trial.suggest_int('max_depth', 3, 10),\n",
" 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),\n",
" 'scale_pos_weight': trial.suggest_float('scale_pos_weight', 2.0, 10.0)\n",
" }\n",
" base_model.set_params(**params)\n",
" pipeline = build_pipeline(preprocessor, base_model)\n",
" scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(3, shuffle=True, random_state=42),\n",
" scoring=make_scorer(f1_score, pos_label=1))\n",
" return scores.mean()\n",
"\n",
" study = optuna.create_study(direction='maximize')\n",
" study.optimize(objective, n_trials=10)\n",
" best_params = study.best_trial.params\n",
" base_model.set_params(**best_params)\n",
"\n",
" print(\"✅ Training final model and evaluating...\")\n",
" final_pipeline = build_pipeline(preprocessor, base_model)\n",
" selector = final_pipeline.named_steps['feature_select']\n",
" if hasattr(selector, 'get_support'):\n",
" feature_names = np.array(final_pipeline.named_steps['preprocessor'].get_feature_names_out())[selector.get_support()]\n",
" else:\n",
" feature_names = np.array(final_pipeline.named_steps['preprocessor'].get_feature_names_out())\n",
" evaluate_model(final_pipeline, X_train, X_test, y_train, y_test, feature_names)\n",
"\n",
" print(\"📊 Monitoring drift...\")\n",
" ref_data = preprocessor.transform(X_train)\n",
" cur_data = preprocessor.transform(X_test)\n",
" if scipy.sparse.issparse(ref_data): ref_data = ref_data.toarray()\n",
" if scipy.sparse.issparse(cur_data): cur_data = cur_data.toarray()\n",
" monitor_drift(pd.DataFrame(ref_data), pd.DataFrame(cur_data), feature_names)\n",
" print(\"💾 Saving model and artifacts...\")\n",
" joblib.dump(final_pipeline, \"model_artifacts/model.pkl\")\n",
" joblib.dump(preprocessor, \"model_artifacts/preprocessor.pkl\")\n",
" X_train.to_csv(\"model_artifacts/X_train_processed.csv\", index=False)\n",
" y_train.to_csv(\"model_artifacts/y_train.csv\", index=False)\n",
" feature_config = {\n",
" \"numeric_features\": numeric_features,\n",
" \"categorical_features\": categorical_features,\n",
" \"multilabel_fields\": multilabel_fields\n",
" }\n",
" json.dump(feature_config, open(\"model_artifacts/feature_config.json\", \"w\"))\n",
" print(\"✅ Training complete. Model artifacts saved.\")\n",
"\n",
"def score(new_data_df, model_dir=\"model_artifacts\"):\n",
" # Load saved artifacts\n",
" model = joblib.load(os.path.join(model_dir, \"model.pkl\"))\n",
" config = json.load(open(os.path.join(model_dir, \"feature_config.json\")))\n",
" text_cols = ['title', 'objective', 'topic_title']\n",
" numeric_features = config[\"numeric_features\"]\n",
" categorical_features = config[\"categorical_features\"]\n",
" multilabel_fields = config[\"multilabel_fields\"]\n",
" new_data_df = prepare_data(new_data_df, is_train=False)\n",
" # Text embedding using saved SVDs\n",
" sbert = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')\n",
" for col in text_cols:\n",
" svd = joblib.load(os.path.join(model_dir, f\"{col}_svd.pkl\"))\n",
" emb = sbert.encode(new_data_df[col].tolist(), show_progress_bar=False)\n",
" reduced = svd.transform(emb)\n",
" embed_df = pd.DataFrame(reduced, columns=[f'{col}_embed_{i}' for i in range(reduced.shape[1])])\n",
" df = pd.concat([df, embed_df], axis=1)\n",
"\n",
" # Final input set\n",
" final_X = new_data_df[numeric_features + categorical_features + multilabel_fields]\n",
" pred = model.predict(final_X)\n",
" prob = model.predict_proba(final_X)\n",
"\n",
" return pred, prob\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import classification_report\n",
"import pandas as pd\n",
"\n",
"def plot_project_duration_distribution(df):\n",
" if 'durationDays' in df.columns:\n",
" data = pd.to_numeric(df['durationDays'], errors='coerce').dropna()\n",
" plt.figure(figsize=(10,6))\n",
" sns.histplot(data, bins=50)\n",
" plt.title('Distribution of Project Durations (days)')\n",
" plt.xlabel('Duration (days)')\n",
" plt.ylabel('Number of Projects')\n",
" plt.show()\n",
" else:\n",
" print(\"Column 'durationDays' not found in DataFrame.\")\n",
"\n",
"def plot_ec_contribution_by_year(df):\n",
" if 'startYear' in df.columns and 'ecMaxContribution' in df.columns:\n",
" df['startYear'] = pd.to_numeric(df['startYear'], errors='coerce')\n",
" df['ecMaxContribution'] = pd.to_numeric(df['ecMaxContribution'], errors='coerce')\n",
" yearly_funding = df.groupby('startYear')['ecMaxContribution'].sum().dropna()\n",
" plt.figure(figsize=(10,6))\n",
" yearly_funding.plot(kind='bar')\n",
" plt.title('Total EC Max Contribution by Start Year')\n",
" plt.ylabel('Total Contribution (€)')\n",
" plt.xlabel('Start Year')\n",
" plt.xticks(rotation=45)\n",
" plt.tight_layout()\n",
" plt.show()\n",
" else:\n",
" print(\"Required columns not found in DataFrame.\")\n",
"\n",
"def plot_participation_by_country(df):\n",
" if 'list_country' in df.columns:\n",
" countries = df['list_country'].explode()\n",
" countries = countries.dropna()\n",
" top_countries = countries.value_counts().head(15)\n",
" plt.figure(figsize=(10,6))\n",
" top_countries.plot(kind='bar')\n",
" plt.title('Top 15 Countries by Project Participation')\n",
" plt.ylabel('Number of Participations')\n",
" plt.xticks(rotation=45)\n",
" plt.tight_layout()\n",
" plt.show()\n",
" else:\n",
" print(\"Column 'list_country' not found in DataFrame.\")\n",
"\n",
"def success_prediction_model(df):\n",
" df = df.copy()\n",
"\n",
" # Define binary target variable\n",
" df['target'] = df['status'].apply(lambda x: 1 if str(x).upper() == 'CLOSED' else 0)\n",
"\n",
" # Feature selection\n",
" features = ['durationDays', 'ecMaxContribution', 'netEcContribution', 'startYear', 'endYear', 'title', 'objective']\n",
" df = df[features + ['target']].dropna(subset=['target'])\n",
"\n",
" # Ensure selected features exist\n",
" for col in features:\n",
" if col not in df.columns:\n",
" print(f\"Missing expected column: {col}\")\n",
" return\n",
"\n",
" # Fill missing text with empty string\n",
" df['title'] = df['title'].fillna('').astype(str)\n",
" df['objective'] = df['objective'].fillna('').astype(str)\n",
"\n",
" numeric_features = ['durationDays', 'ecMaxContribution', 'netEcContribution', 'startYear', 'endYear']\n",
" text_features_title = 'title'\n",
" text_features_objective = 'objective'\n",
"\n",
" preprocessor = ColumnTransformer(transformers=[\n",
" ('num', Pipeline([\n",
" ('imputer', SimpleImputer(strategy='median')),\n",
" ('scaler', StandardScaler())\n",
" ]), numeric_features),\n",
" ('title_tfidf', TfidfVectorizer(max_features=100), text_features_title),\n",
" ('objective_tfidf', TfidfVectorizer(max_features=100), text_features_objective),\n",
" ])\n",
"\n",
" clf = Pipeline(steps=[\n",
" ('preprocessor', preprocessor),\n",
" ('classifier', LogisticRegression(max_iter=1000))\n",
" ])\n",
"\n",
" X = df[features]\n",
" y = df['target']\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
"\n",
" clf.fit(X_train, y_train)\n",
" y_pred = clf.predict(X_test)\n",
"\n",
" print(\"✅ Classification Report:\")\n",
" print(classification_report(y_test, y_pred))\n",
" \n",
"\n",
"def cluster_projects(df, n_clusters=5):\n",
" df = df.copy()\n",
" features = ['durationDays', 'ecMaxContribution', 'netEcContribution']\n",
" df = df[features].dropna()\n",
" \n",
" df[features] = df[features].apply(pd.to_numeric, errors='coerce')\n",
" df = df.dropna()\n",
"\n",
" scaler = StandardScaler()\n",
" scaled_features = scaler.fit_transform(df)\n",
"\n",
" kmeans = KMeans(n_clusters=n_clusters, random_state=42)\n",
" clusters = kmeans.fit_predict(scaled_features)\n",
"\n",
" plt.figure(figsize=(10,6))\n",
" plt.scatter(scaled_features[:, 0], scaled_features[:, 1], c=clusters, cmap='viridis')\n",
" plt.title('Project Clusters')\n",
" plt.xlabel('Scaled Duration')\n",
" plt.ylabel('Scaled EC Contribution')\n",
" plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Romain\\AppData\\Local\\Temp\\ipykernel_17684\\589813601.py:6: DeprecationWarning: `GroupBy.count` is deprecated. It has been renamed to `len`.\n",
" .count()\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"
\n",
" Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.\n",
"
["RHEINISCH-WESTFAELISCHE TECHNISCHE HOCHSCHULE AACHEN", "IDRYMA TECHNOLOGIAS KAI EREVNAS", "MAX-PLANCK-GESELLSCHAFT ZUR FORDERUNG DER WISSENSCHAFTEN EV"]