Commit 69e8901
Parent(s): fa1006f
updates

Files changed:
- DataExploration.ipynb +57 -4
- backend/main.py +15 -12
- data_enhancement.py +442 -0
- predictive_modelling.py +360 -0
- rag_test.py +0 -223
DataExploration.ipynb
CHANGED
@@ -599,19 +599,72 @@
599        },
600        {
601          "cell_type": "code",
602  -        "execution_count":
603          "metadata": {},
604          "outputs": [],
605          "source": [
606            "import polars as pl\n",
607            "import pathlib\n",
608  -          "ROOT = pathlib.Path(r\"C:\\Users\\Romain\\OneDrive - KU Leuven\\
609  -          "OUTDIR = ROOT
610            "\n",
611  -          "consolidated = pl.read_parquet(OUTDIR / \"consolidated.parquet\")\n",
612            "consolidated_clean = pl.read_parquet(OUTDIR / \"consolidated_clean.parquet\")\n"
613          ]
614        },
615        {
616          "cell_type": "markdown",
617          "metadata": {},

599        },
600        {
601          "cell_type": "code",
602  +        "execution_count": 2,
603          "metadata": {},
604          "outputs": [],
605          "source": [
606            "import polars as pl\n",
607            "import pathlib\n",
608  +          "ROOT = pathlib.Path(r\"C:\\Users\\Romain\\OneDrive - KU Leuven\\MDA\\backend\\data\")\n",
609  +          "OUTDIR = ROOT #/ \"combined\"\n",
610            "\n",
611  +          "#consolidated = pl.read_parquet(OUTDIR / \"consolidated.parquet\")\n",
612            "consolidated_clean = pl.read_parquet(OUTDIR / \"consolidated_clean.parquet\")\n"
613          ]
614        },
615  +      {
616  +        "cell_type": "code",
617  +        "execution_count": null,
618  +        "metadata": {},
619  +        "outputs": [
620  +          {
621  +            "data": {
622  +            "text/html": [
623-630  +          … HTML table rendering of consolidated_clean.head() (shape: (5, 68), columns id through ecRatio) omitted; the text/plain preview below shows the same five rows …
631  +            ],
632  +            "text/plain": [
633  +              "shape: (5, 68)\n",
634  +              "┌────────┬──────────────┬────────┬──────────────┬───┬───────────┬─────────┬──────────────┬─────────┐\n",
635  +              "│ id     ┆ acronym      ┆ status ┆ title        ┆ … ┆ startYear ┆ endYear ┆ durationDays ┆ ecRatio │\n",
636  +              "│ ---    ┆ ---          ┆ ---    ┆ ---          ┆   ┆ ---       ┆ ---     ┆ ---          ┆ ---     │\n",
637  +              "│ str    ┆ str          ┆ str    ┆ str          ┆   ┆ i32       ┆ i32     ┆ i64          ┆ f64     │\n",
638  +              "╞════════╪══════════════╪════════╪══════════════╪═══╪═══════════╪═════════╪══════════════╪═════════╡\n",
639  +              "│ 624794 ┆ COMPACTABILI ┆ CLOSED ┆ Contribution ┆ … ┆ 2014      ┆ 2016    ┆ 730          ┆ 0.0     │\n",
640  +              "│        ┆ TY           ┆        ┆ of Compact   ┆   ┆           ┆         ┆              ┆         │\n",
641  +              "│        ┆              ┆        ┆ Neighb…      ┆   ┆           ┆         ┆              ┆         │\n",
642  +              "│ 276810 ┆ ARCHOSL      ┆ CLOSED ┆ Archives of  ┆ … ┆ 2011      ┆ 2014    ┆ 1095         ┆ 0.0     │\n",
643  +              "│        ┆              ┆        ┆ Early Human  ┆   ┆           ┆         ┆              ┆         │\n",
644  +              "│        ┆              ┆        ┆ Occupa…      ┆   ┆           ┆         ┆              ┆         │\n",
645  +              "│ 622478 ┆ DETforDRF    ┆ CLOSED ┆ Design and   ┆ … ┆ null      ┆ null    ┆ null         ┆ 0.0     │\n",
646  +              "│        ┆ 2.0          ┆        ┆ Expansion    ┆   ┆           ┆         ┆              ┆         │\n",
647  +              "│        ┆              ┆        ┆ Turbine f…   ┆   ┆           ┆         ┆              ┆         │\n",
648  +              "│ 615785 ┆ EMERGING     ┆ CLOSED ┆ Emerging     ┆ … ┆ 2014      ┆ 2019    ┆ 1763         ┆ 0.0     │\n",
649  +              "│        ┆ SUBJECTS     ┆        ┆ Subjects of  ┆   ┆           ┆         ┆              ┆         │\n",
650  +              "│        ┆              ┆        ┆ the New E…   ┆   ┆           ┆         ┆              ┆         │\n",
651  +              "│ 237010 ┆ DEER PALAEOB ┆ CLOSED ┆ Palaeobiolog ┆ … ┆ 2009      ┆ 2011    ┆ 639          ┆ 0.0     │\n",
652  +              "│        ┆ IOLOGY       ┆        ┆ ical         ┆   ┆           ┆         ┆              ┆         │\n",
653  +              "│        ┆              ┆        ┆ inference    ┆   ┆           ┆         ┆              ┆         │\n",
654  +              "│        ┆              ┆        ┆ thr…         ┆   ┆           ┆         ┆              ┆         │\n",
655  +              "└────────┴──────────────┴────────┴──────────────┴───┴───────────┴─────────┴──────────────┴─────────┘"
656  +            ]
657  +          },
658  +          "execution_count": 3,
659  +          "metadata": {},
660  +          "output_type": "execute_result"
661  +        }
662  +      ],
663  +      "source": [
664  +        "consolidated_clean.head()\n",
665  +        "#ecMaxContribution, endDate, status, legalBasis, frameworkProgramme, fundingScheme, list_title_report, list_name, list_role, list_city, list_country, list_ecContribution, list_activityType, durationDays"
666  +      ]
667  +    },
668        {
669          "cell_type": "markdown",
670          "metadata": {},
backend/main.py
CHANGED
@@ -9,10 +9,12 @@ from pydantic import BaseModel
9        #except:
10       #    from .rag import get_rag_chain, RAGRequest, RAGResponse
11       from contextlib import asynccontextmanager
12   -
13       import polars as pl
14       import gcsfs
15
16       @asynccontextmanager
17       async def lifespan(app: FastAPI):
18           bucket = "mda_eu_project"

@@ -121,29 +123,30 @@ def get_stats(request: Request):

121      @app.get("/api/project/{project_id}/organizations")
122      def get_project_organizations(project_id: str):
123          df = app.state.df
124          sel = df.filter(pl.col("id") == project_id)
125          if sel.is_empty():
126              raise HTTPException(status_code=404, detail="Project not found")
127
128          orgs_df = (
129  -           sel
130  -
131  -           pl.
132  -           pl.
133              ])
134              .with_columns([
135  -
136  -
137  -           .alias("latlon")
138              ])
139              .with_columns([
140  -           pl.col("latlon").list.get(0).cast(
141  -           pl.col("latlon").list.get(1).cast(
142              ])
143              .filter(pl.col("name").is_not_null())
144  -           .select(["name","country","latitude","longitude"])
145          )
146  -
147          return orgs_df.to_dicts()
148
149      """def rag_chain_depender():
9        #except:
10       #    from .rag import get_rag_chain, RAGRequest, RAGResponse
11       from contextlib import asynccontextmanager
12   +   import os
13       import polars as pl
14       import gcsfs
15
16   +   os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Users\Romain\OneDrive - KU Leuven\focal-pager-460414-e9-45369b738be0.json"
17   +
18       @asynccontextmanager
19       async def lifespan(app: FastAPI):
20           bucket = "mda_eu_project"

123      @app.get("/api/project/{project_id}/organizations")
124      def get_project_organizations(project_id: str):
125          df = app.state.df
126  +
127          sel = df.filter(pl.col("id") == project_id)
128          if sel.is_empty():
129              raise HTTPException(status_code=404, detail="Project not found")
130
131          orgs_df = (
132  +           sel
133  +           .select([
134  +               pl.col("list_name").explode().alias("name"),
135  +               pl.col("list_country").explode().alias("country"),
136  +               pl.col("list_geolocation").explode().alias("geoloc"),
137              ])
138              .with_columns([
139  +               # "geoloc" is a Utf8 "lat,lon" string; splitting yields a List(Utf8)
140  +               pl.col("geoloc").str.split(",").alias("latlon"),
141              ])
142              .with_columns([
143  +               pl.col("latlon").list.get(0).cast(pl.Float64).alias("latitude"),
144  +               pl.col("latlon").list.get(1).cast(pl.Float64).alias("longitude"),
145              ])
146              .filter(pl.col("name").is_not_null())
147  +           .select(["name", "country", "latitude", "longitude"])
148          )
149  +
150          return orgs_df.to_dicts()
151
152      """def rag_chain_depender():
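For reference, a minimal sketch (toy data, assuming Polars >= 0.20) of what the new endpoint logic does to one project row: explode the parallel list columns, split the "lat,lon" string, and cast the parts to floats.

    import polars as pl

    # Hypothetical single-project frame mirroring the list columns used above.
    df = pl.DataFrame({
        "list_name": [["Org A", "Org B"]],
        "list_country": [["BE", "FR"]],
        "list_geolocation": [["50.87,4.70", "48.85,2.35"]],
    })

    orgs = (
        df.select([
            pl.col("list_name").explode().alias("name"),
            pl.col("list_country").explode().alias("country"),
            pl.col("list_geolocation").explode().alias("geoloc"),
        ])
        .with_columns(pl.col("geoloc").str.split(",").alias("latlon"))
        .with_columns([
            pl.col("latlon").list.get(0).cast(pl.Float64).alias("latitude"),
            pl.col("latlon").list.get(1).cast(pl.Float64).alias("longitude"),
        ])
        .filter(pl.col("name").is_not_null())
        .select(["name", "country", "latitude", "longitude"])
    )
    print(orgs.to_dicts())
    # [{'name': 'Org A', 'country': 'BE', 'latitude': 50.87, 'longitude': 4.7}, ...]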
data_enhancement.py
ADDED
@@ -0,0 +1,442 @@
from __future__ import annotations

import csv
import pathlib
import re

import polars as pl  # >= 0.20

ROOT = pathlib.Path(r"C:\Users\Romain\OneDrive - KU Leuven\Masters\MBIS\Year 2\Semester 2\Modern Data Analytics\CORDIS")
DATASETS = [
    "project",
    "projectDeliverables",
    "projectPublications",
    "reportSummaries",
    "organization",
    "euroSciVoc",
    "topics",
    "webItem",
    "webLink",
    "legalBasis",
]
OUTDIR = ROOT / "combined"
OUTDIR.mkdir(exist_ok=True)

###############################################################################
# 2. Generic cleaner -- parameterised version of the original per-file loop
###############################################################################
_PROJECT_ID_RE = re.compile(r"^(?:19|20)\d{2}")
_GENERIC_NUM_RE = re.compile(r"\d{4}")

def _clean_one_file(csv_path: pathlib.Path,
                    number_regex: re.Pattern[str], dataset: str) -> pl.DataFrame:
    """
    Clean a CORDIS CSV whose long *objective* field sometimes explodes into
    extra columns because of stray quotes / semicolons.

    Strategy
    --------
    * A well-formed row has EXPECTED_COLS semicolon-separated columns
      (20 for "project", 25 for "organization").
    * If we get more than EXPECTED_COLS columns we treat columns
      OBJECTIVE_COL … -(TRAILING_KEEP + 1) as belonging to *objective* and
      stitch them back together with a semicolon.
    * For "project" the last three columns are contentUpdateDate | rcn | grantDoi.
    """
    # ---------- constants --------------------------------------------------
    if dataset == "project":
        EXPECTED_COLS = 20   # final width
        TITLE_COL     = 3    # 0-based index of *title*
        DATE1_COL     = 4    # 0-based index of startDate
        DATE2_COL     = 5    # 0-based index of endDate
        OBJECTIVE_COL = 16   # 0-based index of objective
        TRAILING_KEEP = 3    # last three fixed columns
    elif dataset == "organization":
        EXPECTED_COLS = 25   # final width
        TITLE_COL     = 3
        DATE1_COL     = 4
        DATE2_COL     = 5
        OBJECTIVE_COL = 4    # 0-based index of the overflow-prone column
        TRAILING_KEEP = 20   # last twenty fixed columns
    else:
        EXPECTED_COLS = 20
        TITLE_COL     = 3
        DATE1_COL     = 4
        DATE2_COL     = 5
        OBJECTIVE_COL = 16
        TRAILING_KEEP = 3

    date_rx = re.compile(r"\d{4}-\d{2}-\d{2}$")
    is_date = lambda s: (s == "") or bool(date_rx.match(s))

    tmp_clean = csv_path.with_suffix(".cleaned.csv")

    with csv_path.open(encoding="utf-8", newline="") as fin, \
         tmp_clean.open("w", encoding="utf-8", newline="") as fout:

        writer = csv.writer(
            fout,
            delimiter="|",
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
            lineterminator="\n",
        )

        # ---------- iterate raw lines -------------------------------------
        for raw in fin:
            raw = raw.rstrip("\n")
            cells = raw.split(";")  # blind split

            # ---- 1️⃣ repair *title* if dates are not where they belong --
            if (len(cells) > EXPECTED_COLS) and (not is_date(cells[DATE1_COL]) or not is_date(cells[DATE2_COL])) and dataset == "project":
                # look for the first position where *two successive* cells
                # are both valid dates / nulls
                i = DATE1_COL
                while i + 1 < len(cells):
                    if is_date(cells[i]) and is_date(cells[i + 1]):
                        break
                    i += 1
                else:
                    # cannot find a valid date pair → give up on this line
                    continue

                head = cells[:TITLE_COL]               # 0 … 2
                title = ";".join(cells[TITLE_COL:i])   # glue spill-over
                cells = head + [title] + cells[i:]     # rebuild the row

            # ---- 2️⃣ repair *objective* overflow ------------------------
            if len(cells) > EXPECTED_COLS and dataset in ("project", "organization"):
                head = cells[:OBJECTIVE_COL]
                tail = cells[-TRAILING_KEEP:]
                obj = ";".join(cells[OBJECTIVE_COL:-TRAILING_KEEP])
                cells = head + [obj] + tail

            # ---- 3️⃣ pad short rows, skip malformed ---------------------
            if len(cells) < EXPECTED_COLS and dataset in ("project", "organization"):
                cells.extend([""] * (EXPECTED_COLS - len(cells)))

            if len(cells) != EXPECTED_COLS and dataset in ("project", "organization"):
                continue  # still wrong → skip

            # ---- 4️⃣ cell-level clean-ups -------------------------------
            cleaned: list[str] = []
            for cell in cells:
                if cell in ('""', ""):
                    cell = ""
                else:
                    cell = (cell.replace("\t", " ")
                                .replace('"""', '"')
                                .strip())
                    if number_regex.fullmatch(cell):
                        cell = cell.lstrip("0") or "0"
                cleaned.append(cell.strip('"'))
            cleaned[-1] = cleaned[-1].replace('"', '').replace(',', '')
            cleaned[0] = cleaned[0].replace('"', '')
            writer.writerow(cleaned)

    # ---------- read into Polars (all Utf8) -------------------------------
    return pl.read_csv(
        tmp_clean,
        separator="|",
        quote_char='"',
        has_header=True,
        infer_schema_length=0,
        null_values=[""],
        truncate_ragged_lines=True,
    )

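A toy walk-through of the overflow repair (hypothetical row and constants, not from the script): with OBJECTIVE_COL=4 and TRAILING_KEEP=2 in a 7-column layout, a free-text field containing a semicolon is stitched back into one cell.

    # Hypothetical 7-column layout: the free-text field at index 4 overflowed.
    cells = "id;ACR;Some title;2014-01-01;goal A; goal B;2017-01-01;42".split(";")
    EXPECTED_COLS, OBJECTIVE_COL, TRAILING_KEEP = 7, 4, 2  # toy constants

    if len(cells) > EXPECTED_COLS:
        head = cells[:OBJECTIVE_COL]
        tail = cells[-TRAILING_KEEP:]
        obj = ";".join(cells[OBJECTIVE_COL:-TRAILING_KEEP])
        cells = head + [obj] + tail

    print(cells)
    # ['id', 'ACR', 'Some title', '2014-01-01', 'goal A; goal B', '2017-01-01', '42']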
def combine_all_programmes() -> None:
    from pathlib import Path
    for dataset in DATASETS:
        combined: list[pl.DataFrame] = []

        for programme_dir in ROOT.iterdir():
            if not programme_dir.is_dir():
                continue
            csv_file = programme_dir / f"{dataset}.csv"
            if not csv_file.exists():
                continue

            regex = _PROJECT_ID_RE if dataset == "project" else _GENERIC_NUM_RE
            df = _clean_one_file(csv_file, regex, dataset)
            print(programme_dir)
            # ---------- type coercions matching the original notebook code ----------
            if dataset == "project":
                df = (
                    df
                    .with_columns([
                        pl.col("id"),  # kept as Utf8 (Int64 cast disabled)
                        pl.col("acronym").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("status").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("title").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("legalBasis").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("topics").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("frameworkProgramme").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("masterCall").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("subCall").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("fundingScheme").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("nature").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("objective").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("grantDoi").cast(pl.Utf8, strict=False).str.strip_chars('"'),
                        pl.col("totalCost").cast(pl.Utf8, strict=False).str.strip_chars('"').str.replace_all('"', '').str.replace(",", ".").cast(pl.Float64),
                        pl.col("ecMaxContribution").cast(pl.Utf8, strict=False).str.strip_chars('"').str.replace_all('"', '').str.replace(",", ".").cast(pl.Float64),
                        pl.col("startDate").cast(pl.Utf8, strict=False).str.strip_chars('"').str.strptime(pl.Date, "%Y-%m-%d", strict=False),
                        pl.col("endDate").cast(pl.Utf8, strict=False).str.strip_chars('"').str.strptime(pl.Date, "%Y-%m-%d", strict=False),
                        pl.col("ecSignatureDate").cast(pl.Utf8, strict=False).str.strip_chars('"').str.strptime(pl.Date, "%Y-%m-%d", strict=False),
                        pl.col("contentUpdateDate").cast(pl.Utf8, strict=False).str.strip_chars('"').str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
                        pl.col("rcn").cast(pl.Int64),
                    ])
                    .with_columns(
                        pl.lit(programme_dir.name).alias("programmeFolder")  # <-- NEW COLUMN
                    )
                )
            elif dataset == "organization":
                df = df.with_columns([
                    pl.col("contentUpdateDate").cast(pl.Utf8, strict=False).str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
                    pl.col("totalCost").cast(pl.Utf8, strict=False).str.replace(",", ".").cast(pl.Float64),
                ])
            elif dataset == "projectDeliverables":
                df = df.with_columns([
                    pl.col("contentUpdateDate").cast(pl.Utf8, strict=False)
                      .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
                ])
            elif dataset == "projectPublications":
                # the H2013 dump ships with legacy upper-case column names
                if programme_dir == Path(r"C:\Users\Romain\OneDrive - KU Leuven\Masters\MBIS\Year 2\Semester 2\Modern Data Analytics\CORDIS\H2013"):
                    rename_map = {
                        "RECORD_ID": "id",
                        "TITLE": "title",
                        "AUTHOR": "authors",
                        "DOI": "doi",
                        "PROJECT_ID": "projectID",
                        "JOURNAL_TITLE": "journalTitle",
                        "PAGES": "publishedPages",
                        "PUBLICATION_TYPE": "isPublishedAs",
                    }
                    df = df.rename(rename_map)
                else:
                    df = df.with_columns([
                        pl.col("contentUpdateDate").cast(pl.Utf8, strict=False)
                          .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
                        pl.col("id").cast(pl.Utf8, strict=False)
                          .str.extract(r"^(\d+)_", 1)
                          .alias("projectID"),
                    ])
            elif dataset == "reportSummaries":
                df = df.with_columns(
                    pl.col("contentUpdateDate").cast(pl.Utf8, strict=False)
                      .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
                )
            elif dataset == "webItem":
                df = df.with_columns(
                    pl.col("uri").cast(pl.Utf8, strict=False)
                      .str.extract(r"/files/\d+/(\d+)/", 1)
                      .cast(pl.Int64)
                      .alias("projectID"),
                )

            # ---------------------------------------------------------------
            combined.append(df)

        # --------------------------------------------------------------------
        # Write out per-dataset parquet
        # --------------------------------------------------------------------
        if combined:
            how = "vertical_relaxed"
            if dataset == "projectPublications":
                how = "diagonal"  # the renamed H2013 frames have a different column set
            result = pl.concat(combined, how=how)
            parquet_path = OUTDIR / f"{dataset}_all.parquet"
            result.write_parquet(parquet_path)
            print(f"✔ {dataset:15s} → {parquet_path}")

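The two concat strategies matter here: "vertical_relaxed" unions frames with matching columns while relaxing dtypes, whereas "diagonal" also tolerates different column sets, which the renamed H2013 publications need. A minimal sketch (toy frames, assuming Polars >= 0.20):

    import polars as pl

    a = pl.DataFrame({"projectID": ["1"], "title": ["x"]})
    b = pl.DataFrame({"projectID": ["2"], "doi": ["10.1234/abc"]})  # different schema

    print(pl.concat([a, b], how="diagonal"))
    # shape: (2, 3): projectID, title, doi; missing cells are filled with null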
###############################################################################
# 3. Consolidate the per-dataset parquets into one wide table
#    (ROOT, OUTDIR and DATASETS are defined at the top of the file)
###############################################################################

dfs = {}
for dataset in DATASETS:
    path = OUTDIR / f"{dataset}_all.parquet"
    dfs[dataset] = pl.read_parquet(path)

projects = dfs["project"]

projects_deliv = (
    dfs["projectDeliverables"]
    .group_by("projectID")
    .agg([
        pl.col("deliverableType").alias("list_deliverableType"),
        pl.col("url").alias("list_url"),
        pl.col("contentUpdateDate").alias("list_contentUpdateDate"),
    ])
)

projects_publi = (
    dfs["projectPublications"]
    .group_by("projectID")
    .agg([
        pl.col("authors").alias("list_authors"),
        pl.col("title").alias("list_title"),
        pl.col("doi").alias("list_doi"),
        pl.col("journalTitle").alias("list_journalTitle"),
        pl.col("isPublishedAs").alias("list_isPublishedAs"),
        pl.col("publishedYear").alias("list_publishedYear"),
        pl.col("contentUpdateDate").alias("list_contentUpdateDate"),
    ])
)

report = (
    dfs["reportSummaries"]
    .group_by("projectID")
    .agg([
        pl.col("title").alias("list_title"),
        pl.col("attachment").alias("list_attachment"),
        pl.col("contentUpdateDate").alias("list_contentUpdateDate"),
    ])
)

org = (
    dfs["organization"]
    .group_by("projectID")
    .agg([
        pl.col("organisationID").alias("list_organisationID"),
        pl.col("country").alias("list_country"),
        pl.col("name").alias("list_name"),
        pl.col("SME").alias("list_SME"),
        pl.col("city").alias("list_city"),
        pl.col("geolocation").alias("list_geolocation"),
        pl.col("organizationURL").alias("list_organizationURL"),
        pl.col("role").alias("list_role"),
        pl.col("ecContribution").alias("list_ecContribution"),
        pl.col("netEcContribution").alias("list_netEcContribution"),
        pl.col("totalCost").alias("list_totalCost"),
        pl.col("endOfParticipation").alias("list_endOfParticipation"),
        pl.col("activityType").alias("list_activityType"),
        pl.col("contentUpdateDate").alias("list_contentUpdateDate"),
    ])
)

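Each of these group_by/agg blocks collapses many rows per project into one row whose columns are lists; the join chain further down then attaches those lists to the project table. A toy sketch of the pattern (hypothetical data):

    import polars as pl

    orgs = pl.DataFrame({
        "projectID": ["1", "1", "2"],
        "name": ["Org A", "Org B", "Org C"],
    })
    print(orgs.group_by("projectID").agg(pl.col("name").alias("list_name")))
    # project 1 → list_name = ["Org A", "Org B"]; project 2 → ["Org C"]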
voc = (
    dfs["euroSciVoc"]
    .group_by("projectID")
    .agg([
        pl.col("euroSciVocTitle").alias("list_euroSciVocTitle"),
        pl.col("euroSciVocPath").alias("list_euroSciVocPath"),
        pl.col("euroSciVocDescription").alias("list_description"),
    ])
)

topic = (
    dfs["topics"]
    .group_by("projectID")
    .agg([
        pl.col("topic").alias("list_topic"),
        pl.col("title").alias("list_title"),
    ])
)

web_item = dfs["webItem"]  # no aggregation

web_link = (
    dfs["webLink"]
    .group_by("projectID")
    .agg([
        pl.col("physUrl").alias("list_physUrl"),
        pl.col("availableLanguages").alias("list_availableLanguages"),
        pl.col("status").alias("list_status"),
        pl.col("archivedDate").alias("list_archivedDate"),
        pl.col("type").alias("list_type"),
        pl.col("source").alias("list_source"),
        pl.col("represents").alias("list_represents"),
    ])
)

legal = (
    dfs["legalBasis"]
    .group_by("projectID")
    .agg([
        pl.col("legalBasis").alias("list_legalBasis"),
        pl.col("title").alias("list_title"),
        pl.col("uniqueProgrammePart").alias("list_uniqueProgrammePart"),
    ])
)

consolidated = (
    projects
    .join(projects_deliv, left_on="id", right_on="projectID", suffix="_deliv", how="left")
    .join(projects_publi, left_on="id", right_on="projectID", suffix="_publi", how="left")
    .join(report, left_on="id", right_on="projectID", suffix="_report", how="left")
    .join(org, left_on="id", right_on="projectID", suffix="_org", how="left")
    .join(web_link, left_on="id", right_on="projectID", suffix="_link", how="left")
    .join(legal, left_on="id", right_on="projectID", suffix="_legal", how="left")
    .join(topic, left_on="id", right_on="projectID", suffix="_topic", how="left")
    .join(voc, left_on="id", right_on="projectID", suffix="_voc", how="left")
)

for col in ["startDate", "endDate"]:
    if consolidated[col].dtype == pl.Utf8:
        consolidated = consolidated.with_columns(
            pl.col(col).str.strptime(pl.Date, "%Y-%m-%d").alias(col)
        )

consolidated = consolidated.with_columns(
    pl.col("list_netEcContribution").list.eval(pl.element().cast(pl.Float64), parallel=True)
      .list.sum().alias("netEcContribution")
)

consolidated = consolidated.with_columns(
    pl.col("totalCost").cast(pl.Float64),
    pl.col("netEcContribution").cast(pl.Float64)
)

consolidated = consolidated.with_columns([
    pl.col("startDate").dt.year().alias("startYear"),
    pl.col("endDate").dt.year().alias("endYear"),
    (pl.col("endDate") - pl.col("startDate")).dt.total_days().alias("durationDays"),
    (pl.col("netEcContribution") / pl.col("totalCost")).alias("ecRatio"),
])

consolidated.write_parquet(OUTDIR / "consolidated.parquet")

excluded_frameworks = ["FP1", "FP2", "FP3", "FP4", "FP5", "FP6"]

consolidated_clean = consolidated.filter(~pl.col("frameworkProgramme").is_in(excluded_frameworks))

consolidated_clean.write_parquet(OUTDIR / "consolidated_clean.parquet")
predictive_modelling.py
ADDED
@@ -0,0 +1,360 @@
import os
import json
import joblib
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
import scipy.sparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline as SKPipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, f1_score, make_scorer
from sklearn.decomposition import TruncatedSVD
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import IsolationForest

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import ADASYN

from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier

from evidently import Report
from evidently.presets import DataDriftPreset

import optuna

# --- Custom Transformers ---
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    """One-hot encodes a column of label lists, one output column per label."""
    def fit(self, X, y=None):
        self.col = X.name  # X is the pandas Series selected by ColumnTransformer
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit(X)
        return self
    def transform(self, X):
        return self.mlb.transform(X)
    def get_feature_names_out(self, input_features=None):
        return [f"{self.col}_{cls}" for cls in self.mlb.classes_]
    def get_params(self, deep=True):
        return {}
    def set_params(self, **params):
        return self

class AnomalyScoreTransformer(BaseEstimator, TransformerMixin):
    """Appends an IsolationForest anomaly score as one extra feature column."""
    def __init__(self):
        self.model = IsolationForest(n_estimators=200, contamination=0.1, random_state=42)

    def fit(self, X, y=None):
        self.model.fit(X)
        return self

    def transform(self, X):
        # decision_function is negated so that higher = more anomalous
        scores = -self.model.decision_function(X)
        return np.hstack([X, scores.reshape(-1, 1)])

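A quick toy check of the multilabel transformer defined above (hypothetical data; it assumes the class definition from this file):

    import pandas as pd

    s = pd.Series([["BE", "FR"], ["FR"], []], name="list_country")
    t = MultiLabelBinarizerTransformer().fit(s)
    print(t.get_feature_names_out())  # ['list_country_BE', 'list_country_FR']
    print(t.transform(s))
    # [[1 1]
    #  [0 1]
    #  [0 0]]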
# --- Step 1: Data Preparation ---
def prepare_data(df, is_train=True, model_dir="model_artifacts"):
    df = df.copy()

    if is_train:
        df['status'] = df['status'].astype(str).str.upper()
        df = df[df['status'].isin(['CLOSED', 'TERMINATED'])]
        df['label'] = df['status'].map({'CLOSED': 0, 'TERMINATED': 1})
        assert df['label'].notna().all(), "Label column still has NaNs!"

    multilabel_fields = [
        'list_country', 'list_activityType', 'list_deliverableType',
        'list_availableLanguages', 'list_euroSciVocTitle'
    ]

    def extract_intermediate_levels(paths):
        # parquet round-trips may yield numpy arrays instead of lists
        if paths is not None and hasattr(paths, 'tolist'):
            paths = paths.tolist()
        tokens = []
        if isinstance(paths, list):
            for p in paths:
                parts = p.strip('/').split('/')
                tokens.extend(parts[:-1])
        return list(set(tokens))
    df['euroSciVoc_intermediate'] = df['list_euroSciVocPath'].apply(extract_intermediate_levels)
    multilabel_fields.append('euroSciVoc_intermediate')

    for col in multilabel_fields:
        df[col] = df[col].apply(lambda x: [] if x is None else (x.tolist() if hasattr(x, 'tolist') else x))
        df[col] = df[col].apply(lambda x: list(x) if not isinstance(x, list) else x)
        df[col] = df[col].apply(lambda x: [item for item in x if item is not None])
        df[col] = df[col].apply(lambda x: [str(item).upper() for item in x])

    def split_languages(lang_list):
        if not isinstance(lang_list, list):
            return []
        result = []
        for entry in lang_list:
            if isinstance(entry, str):
                result.extend(entry.split(","))
        return result

    df["list_availableLanguages"] = df["list_availableLanguages"].apply(split_languages)

    for col in ['title', 'objective']:
        df[col] = df[col].fillna("").astype(str)

    df['n_partners'] = df['list_name'].apply(
        lambda x: len(x.tolist()) if x is not None and hasattr(x, 'tolist') else (len(x) if isinstance(x, list) else 0)
    )

    df['n_country'] = df['list_country'].apply(
        lambda x: len(x.tolist()) if x is not None and hasattr(x, 'tolist') else (len(x) if isinstance(x, list) else 0)
    )

    df['n_sme'] = df['list_SME'].apply(
        lambda x: sum(1 for i in (x.tolist() if hasattr(x, 'tolist') else x) if i is True)
        if x is not None and (hasattr(x, 'tolist') or isinstance(x, list)) else 0
    )

    return df

# --- Step 2: Text Embedding ---
def compute_embeddings(df, text_columns, model_name='sentence-transformers/LaBSE', svd_dim=50):
    model = SentenceTransformer(model_name)
    os.makedirs("/content/drive/MyDrive/model_artifacts", exist_ok=True)
    os.makedirs("/content/drive/MyDrive/embeddings", exist_ok=True)
    for col in text_columns:
        embedding_file = f"/content/drive/MyDrive/embeddings/{col}_embeddings.npy"
        svd_file = f"/content/drive/MyDrive/model_artifacts/{col}_svd.pkl"
        if os.path.exists(embedding_file):
            print(f"Loading saved embeddings for column '{col}'...")
            embeddings = np.load(embedding_file)
        else:
            print(f"Computing embeddings for column '{col}'...")
            embeddings = model.encode(df[col].tolist(), show_progress_bar=True)
            np.save(embedding_file, embeddings)

        print(f"Fitting SVD for column '{col}'...")
        svd = TruncatedSVD(n_components=svd_dim, random_state=42)
        svd.fit(embeddings)
        joblib.dump(svd, svd_file)

        reduced = svd.transform(embeddings)
        embed_df = pd.DataFrame(reduced, columns=[f'{col}_embed_{i}' for i in range(reduced.shape[1])])
        embed_df.index = df.index  # force matching index
        df = pd.concat([df, embed_df], axis=1)
    return df

# --- Step 3: Build Preprocessor ---
def build_preprocessor(numeric_features, categorical_features, multilabel_fields):
    numeric_pipeline = SKPipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ], memory="cache_dir")

    categorical_pipeline = SKPipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ], memory="cache_dir")

    transformers = [
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
        *[(f'mlb_{col}', MultiLabelBinarizerTransformer(), col) for col in multilabel_fields],
    ]

    return ColumnTransformer(transformers, sparse_threshold=0.0)

# --- Step 4: Build Pipeline ---
def build_pipeline(preprocessor, base_model, k=250):
    return ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('anomaly', AnomalyScoreTransformer()),
        ('resample', ADASYN()),
        ("variance_filter", VarianceThreshold(threshold=0.0)),
        ('feature_select', SelectKBest(score_func=f_classif, k=k)),
        ('classifier', CalibratedClassifierCV(estimator=base_model, method='isotonic', cv=3))
    ])

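Worth noting about the step order: imblearn's Pipeline runs resamplers such as ADASYN only during fit, never at prediction time, so oversampling affects the training data but not scored rows. A minimal sketch on toy, imbalanced data (illustrative only; the real features come from the preprocessor above):

    import numpy as np
    from imblearn.pipeline import Pipeline as ImbPipeline
    from imblearn.over_sampling import ADASYN
    from xgboost import XGBClassifier

    rng = np.random.default_rng(42)
    X_toy = rng.normal(size=(300, 8))
    y_toy = (rng.random(300) < 0.2).astype(int)  # ~20% positives

    pipe = ImbPipeline([("resample", ADASYN()),
                        ("clf", XGBClassifier(eval_metric="logloss"))])
    pipe.fit(X_toy, y_toy)          # ADASYN synthesises minority rows here
    print(pipe.predict(X_toy[:5]))  # prediction sees the 5 rows unchanged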
# --- Step 5: Drift Monitoring ---
def monitor_drift(reference, current, feature_names, output_html='drift_report.html'):
    ref_df = pd.DataFrame(reference, columns=feature_names)
    cur_df = pd.DataFrame(current, columns=feature_names)

    report = Report(metrics=[DataDriftPreset()])
    report.run(reference_data=ref_df, current_data=cur_df)
    report.save_html(output_html)
    print(f"✅ Drift report saved to {output_html}")

# --- Step 6: Evaluation + SHAP ---
def evaluate_model(model, X_train, X_test, y_train, y_test, feature_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    plt.title("Evaluation")
    plt.tight_layout()
    plt.show()

    # Re-apply every transform step before the classifier, so the matrix
    # matches what feature_select was fitted on (the anomaly step appends a column).
    X_proc = model.named_steps['preprocessor'].transform(X_test)
    if scipy.sparse.issparse(X_proc):
        X_proc = X_proc.toarray()
    X_proc = model.named_steps['anomaly'].transform(X_proc)
    X_proc = model.named_steps['variance_filter'].transform(X_proc)

    selector = model.named_steps['feature_select']
    X_selected = selector.transform(X_proc)

    # CalibratedClassifierCV keeps its fitted base models on
    # calibrated_classifiers_; explain the first fold's XGB estimator
    xgb_est = model.named_steps['classifier'].calibrated_classifiers_[0].estimator
    explainer = shap.Explainer(xgb_est, feature_names=feature_names)
    shap_values = explainer(X_selected)
    shap.summary_plot(shap_values, X_selected)

# --- Final Orchestration ---
def status_prediction_model(df):
    os.makedirs("model_artifacts", exist_ok=True)
    print("🧹 Preparing data...")
    df = prepare_data(df, is_train=True)
    print("💡 Embedding text...")
    df = compute_embeddings(df, ['title', 'objective'])

    text_embed_cols = [col for col in df.columns if '_embed_' in col]
    numeric_features = ['durationDays', 'ecMaxContribution', 'totalCost',
                        'n_partners', 'n_country', 'n_sme'] + text_embed_cols
    categorical_features = ['fundingScheme', 'legalBasis', 'nature']
    multilabel_fields = ['list_country', 'list_activityType', 'list_deliverableType',
                         'list_availableLanguages', 'list_euroSciVocTitle', 'euroSciVoc_intermediate']

    df = df[numeric_features + categorical_features + multilabel_fields + ['label']]
    X = df.drop(columns='label')
    y = df['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    print("🧱 Building pipeline...")
    preprocessor = build_preprocessor(numeric_features, categorical_features, multilabel_fields)
    base_model = XGBClassifier(eval_metric='logloss', n_jobs=-1)

    print("🎯 Training model with Optuna...")
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 2.0, 10.0)
        }
        base_model.set_params(**params)
        pipeline = build_pipeline(preprocessor, base_model)
        scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(3, shuffle=True, random_state=42),
                                 scoring=make_scorer(f1_score, pos_label=1), n_jobs=-1)
        return scores.mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=24, n_jobs=6)
    best_params = study.best_trial.params
    base_model.set_params(**best_params)

    print("✅ Training final model and evaluating...")
    final_pipeline = build_pipeline(preprocessor, base_model)
    final_pipeline.fit(X_train, y_train)  # fit once so fitted feature names / supports exist
    selector = final_pipeline.named_steps['feature_select']
    # preprocessor names + the appended anomaly-score column, filtered like the data
    feature_names = np.append(
        final_pipeline.named_steps['preprocessor'].get_feature_names_out(), "anomaly_score")
    feature_names = feature_names[final_pipeline.named_steps['variance_filter'].get_support()]
    if hasattr(selector, 'get_support'):
        feature_names = feature_names[selector.get_support()]
    evaluate_model(final_pipeline, X_train, X_test, y_train, y_test, feature_names)

    print("📊 Monitoring drift...")
    ref_data = preprocessor.transform(X_train)
    cur_data = preprocessor.transform(X_test)
    if scipy.sparse.issparse(ref_data): ref_data = ref_data.toarray()
    if scipy.sparse.issparse(cur_data): cur_data = cur_data.toarray()
    # these matrices are pre-selection, so use the preprocessor's own names
    monitor_drift(ref_data, cur_data, preprocessor.get_feature_names_out())
    print("💾 Saving model and artifacts...")
    joblib.dump(final_pipeline, "model_artifacts/model.pkl")
    joblib.dump(preprocessor, "model_artifacts/preprocessor.pkl")
    X_train.to_csv("model_artifacts/X_train_processed.csv", index=False)
    y_train.to_csv("model_artifacts/y_train.csv", index=False)
    feature_config = {
        "numeric_features": numeric_features,
        "categorical_features": categorical_features,
        "multilabel_fields": multilabel_fields
    }
    with open("model_artifacts/feature_config.json", "w") as f:
        json.dump(feature_config, f)
    print("✅ Training complete. Model artifacts saved.")

def score(new_df, model_dir="model_artifacts"):
    # 1) Load artifacts
    pipe = joblib.load(os.path.join(model_dir, "model.pkl"))
    with open(os.path.join(model_dir, "feature_config.json")) as f:
        config = json.load(f)

    # 2) Prepare & embed exactly as in training
    df = prepare_data(new_df.copy(), is_train=False)
    text_cols = ['title', 'objective']
    sbert = SentenceTransformer('sentence-transformers/LaBSE')
    for col in text_cols:
        # load the SVD fitted during training
        svd = joblib.load(os.path.join(model_dir, f"{col}_svd.pkl"))
        emb = sbert.encode(df[col].tolist(), show_progress_bar=False)
        reduced = svd.transform(emb)
        emb_df = pd.DataFrame(reduced,
                              columns=[f"{col}_embed_{i}" for i in range(reduced.shape[1])],
                              index=df.index)
        df = pd.concat([df, emb_df], axis=1)

    # 3) Build the final feature set
    X = df[config["numeric_features"]
           + config["categorical_features"]
           + config["multilabel_fields"]]

    # 4) Predict & attach to DataFrame
    preds = pipe.predict(X)
    probs = pipe.predict_proba(X)[:, 1]  # assume binary, positive class = index 1
    df["predicted_label"] = preds
    df["predicted_prob"] = probs

    # 5) SHAP explanations on the *selected* features
    #    (re-run every transform step the classifier saw during fit)
    preproc = pipe.named_steps["preprocessor"]
    select = pipe.named_steps["feature_select"]
    # CalibratedClassifierCV stores its fitted base models on calibrated_classifiers_
    clf = pipe.named_steps["classifier"].calibrated_classifiers_[0].estimator

    X_proc = preproc.transform(X)
    if scipy.sparse.issparse(X_proc):
        X_proc = X_proc.toarray()
    X_proc = pipe.named_steps["anomaly"].transform(X_proc)
    X_proc = pipe.named_steps["variance_filter"].transform(X_proc)
    X_sel = select.transform(X_proc)

    # names: preprocessor output + the appended anomaly score, filtered like the data
    feature_names = np.append(preproc.get_feature_names_out(), "anomaly_score")
    feature_names = feature_names[pipe.named_steps["variance_filter"].get_support()]
    feature_names = feature_names[select.get_support()]

    # Use a SHAP explainer directly on the fitted XGB base estimator
    explainer = shap.Explainer(clf, X_sel, feature_names=feature_names)
    shap_vals = explainer(X_sel)  # returns a shap Explanation object

    # 6) For each row, pick the top-4 absolute contributors
    shap_df = pd.DataFrame(shap_vals.values, columns=feature_names, index=df.index)
    abs_shap = shap_df.abs()

    top_feats = abs_shap.apply(lambda row: row.nlargest(4).index.tolist(), axis=1)
    top_vals = abs_shap.apply(lambda row: row.nlargest(4).values.tolist(), axis=1)

    df[["top1_feature", "top2_feature", "top3_feature", "top4_feature"]] = pd.DataFrame(
        top_feats.tolist(), index=df.index
    )
    df[["top1_shap", "top2_shap", "top3_shap", "top4_shap"]] = pd.DataFrame(
        top_vals.tolist(), index=df.index
    )

    return df

353 |
+
if __name__ == "__main__":
|
354 |
+
df = pd.read_csv("your_data.csv")
|
355 |
+
|
356 |
+
status_prediction_model(df)
|
357 |
+
|
358 |
+
new_df = pd.read_csv("new_data.csv")
|
359 |
+
scored_df = score(new_df)
|
360 |
+
print(scored_df.head())
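Once model_artifacts/ has been populated by a training run, the scoring entry point can surface the highest-risk rows together with their strongest SHAP driver. A usage sketch (the module name and input file follow the __main__ block above; the output columns are the ones score() attaches):

import pandas as pd
from predictive_modelling import score

new_df = pd.read_csv("new_data.csv")
scored = score(new_df)
# Rank by predicted probability and show each row's top SHAP contributor.
at_risk = scored.sort_values("predicted_prob", ascending=False).head(5)
for _, row in at_risk.iterrows():
    print(f'{row["predicted_prob"]:.2f}  {row["top1_feature"]} ({row["top1_shap"]:.3f})')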
rag_test.py DELETED
@@ -1,223 +0,0 @@
-import streamlit as st
-import pandas as pd
-import numpy as np
-import faiss
-import pickle
-import spacy
-import re
-from sentence_transformers import SentenceTransformer
-from langchain.prompts import PromptTemplate
-from langchain.llms import OpenAI
-import matplotlib.pyplot as plt
-
-# --- Load Models and Data ---
-@st.cache_resource
-def load_models_and_data():
-    nlp = spacy.load("en_core_web_sm")
-    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
-
-    projects_df = pd.read_parquet("cordis_projects.parquet")
-    with open("topic_summaries.pkl", "rb") as f:
-        topic_summaries_df = pickle.load(f)
-
-    project_index = faiss.read_index("project_chunks.faiss")
-    topic_index = faiss.read_index("topic_summary_index.faiss")
-
-    return nlp, embedding_model, projects_df, topic_summaries_df, project_index, topic_index
-
-nlp, embedding_model, projects_df, topic_summaries_df, project_index, topic_index = load_models_and_data()
-
-# --- RAG Components ---
-
-class SessionContext:
-    def __init__(self):
-        self.last_query_type = None
-        self.last_entity = None
-        self.last_projects_df = None
-        self.last_project_id = None
-
-    def update(self, query_type, entity, df=None, project_id=None):
-        self.last_query_type = query_type
-        self.last_entity = entity
-        self.last_projects_df = df
-        self.last_project_id = project_id
-
-session_context = SessionContext()
-
-def classify_query_type(query: str) -> str:
-    query = query.lower()
-    if re.search(r'\b(project|grant agreement|projectid|rcn|ga)\b', query):
-        return "project"
-    if re.search(r'\b(topic|eurovoc|euro-scivoc)\b', query):
-        return "topic"
-    if re.search(r'\b(organization|institution|company|beneficiary)\b', query):
-        return "organization"
-    if re.search(r'\b(legalbasis|legislation|h2020|fp7)\b', query):
-        return "legalBasis"
-    return "general"
-
-def extract_entities_custom(query: str) -> dict:
-    entities = {"project_id": None, "organization": None, "topic": None}
-    doc = nlp(query)
-    match = re.search(r"\b\d{6,8}\b", query)
-    if match:
-        entities["project_id"] = match.group(0)
-    for ent in doc.ents:
-        if ent.label_ == "ORG":
-            entities["organization"] = ent.text
-        if ent.label_ == "MISC":
-            entities["topic"] = ent.text
-    return entities
-
-def retrieve_project_chunks_by_id(project_id: str) -> list:
-    subset = projects_df[projects_df['project_id'] == project_id]
-    return subset['chunk_text'].tolist()
-
-def retrieve_topic_summary_from_project(project_id: str) -> str:
-    row = projects_df[projects_df['project_id'] == project_id]
-    if row.empty:
-        return ""
-    topic = row['topic_path'].values[0]
-    match = topic_summaries_df[topic_summaries_df['topic'] == topic]
-    return match['summary'].values[0] if not match.empty else ""
-
-def get_kpi_context() -> str:
-    return (
-        "Average project duration: 780 days\n"
-        "Termination rate: 12.3%\n"
-        "Top countries by project count: Germany, France, Italy\n"
-        "Common termination reasons: coordination failure, underperformance"
-    )
-
-def run_reasoning(question: str, df: pd.DataFrame) -> str:
-    q = question.lower()
-    if "average" in q and "funding" in q:
-        avg = df['ecMaxContribution'].mean()
-        return f"The average funding is €{avg:,.2f}."
-    return ""
-
-def generate_follow_up_suggestions(query_type: str) -> str:
-    if query_type == "project":
-        return "Would you like more info on related topics or organizations involved?"
-    if query_type == "organization":
-        return "Want to explore more projects from this organization?"
-    if query_type == "topic":
-        return "Would you like to see top projects under this topic?"
-    return ""
-
-def is_follow_up_question(question: str) -> bool:
-    q = question.lower()
-    return any(phrase in q for phrase in [
-        "they", "those", "what do they", "what are they about", "explain them", "what's the topic"
-    ])
-
-def summarize_projects(df: pd.DataFrame, field="objective", top_n=5) -> str:
-    texts = df[field].dropna().tolist()[:top_n]
-    if not texts:
-        return "No objectives or summaries available for these projects."
-    combined_text = "\n\n".join(texts)
-    prompt = f"Summarize what these projects are generally about:\n\n{combined_text}"
-    return OpenAI(temperature=0)(prompt)
-
-rag_template = PromptTemplate(
-    input_variables=["project_info", "topic_summary", "kpi_context", "programmatic", "question", "followup"],
-    template="""
-You are a research assistant for a European funding agency. Use the provided context to answer the user query.
-
---- Project Info ---
-{project_info}
-
---- Topic Summary ---
-{topic_summary}
-
---- KPI Context ---
-{kpi_context}
-
---- Data Insights ---
-{programmatic}
-
---- Question ---
-{question}
-
---- Answer ---
-
-Also consider:
-{followup}
-"""
-)
-
-def chat_with_context(question: str, session: SessionContext) -> str:
-    if is_follow_up_question(question) and session.last_projects_df is not None:
-        return summarize_projects(session.last_projects_df)
-
-    query_type = classify_query_type(question)
-    entities = extract_entities_custom(question)
-    project_info = topic_summary = programmatic = ""
-    relevant_df = projects_df.copy()
-    project_id = None
-
-    if entities["project_id"]:
-        project_id = entities["project_id"]
-        project_info = "\n".join(retrieve_project_chunks_by_id(project_id))
-        topic_summary = retrieve_topic_summary_from_project(project_id)
-        relevant_df = projects_df[projects_df['project_id'] == project_id]
-
-    elif query_type == "organization":
-        org = entities.get("organization")
-        if org:
-            relevant_df = projects_df[projects_df['list_name'].str.contains(org, case=False, na=False)]
-            project_info = "\n".join(relevant_df['chunk_text'].head(3))
-
-    programmatic = run_reasoning(question, relevant_df)
-    kpi_context = get_kpi_context()
-    followup = generate_follow_up_suggestions(query_type)
-
-    prompt = rag_template.format(
-        project_info=project_info,
-        topic_summary=topic_summary,
-        kpi_context=kpi_context,
-        programmatic=programmatic,
-        question=question,
-        followup=followup
-    )
-
-    session.update(query_type, entities.get("organization") or entities.get("project_id"), relevant_df, project_id)
-
-    return OpenAI(temperature=0)(prompt)
-
-# --- Streamlit UI ---
-
-st.set_page_config(page_title="EU Funding Explorer", layout="wide")
-st.title("🇪🇺 EU Projects Dashboard")
-
-tabs = st.tabs(["📊 Dashboard", "📁 Projects + Chatbot"])
-
-# --- Tab 1: Dashboard ---
-with tabs[0]:
-    st.subheader("Funding Overview")
-    funding_by_year = projects_df.groupby("startYear")["ecMaxContribution"].sum().reset_index()
-    plt.figure(figsize=(10, 4))
-    plt.bar(funding_by_year["startYear"], funding_by_year["ecMaxContribution"] / 1e6)
-    plt.ylabel("Total Funding (€M)")
-    st.pyplot(plt)
-
-    top_orgs = projects_df["list_name"].value_counts().head(10)
-    st.bar_chart(top_orgs)
-
-# --- Tab 2: Projects + Chatbot ---
-with tabs[1]:
-    left, right = st.columns([2, 1])
-
-    with left:
-        st.subheader("Browse Projects")
-        page_size = 10
-        page_num = st.number_input("Page", min_value=0, max_value=(len(projects_df) // page_size), step=1)
-        paginated = projects_df.iloc[page_num * page_size : (page_num + 1) * page_size]
-        st.dataframe(paginated[["title", "list_name", "ecMaxContribution", "startYear", "status"]], use_container_width=True)
-
-    with right:
-        st.subheader("Ask the Chatbot")
-        user_input = st.text_input("Ask a question...")
-        if st.button("Ask") and user_input:
-            answer = chat_with_context(user_input, session_context)
-            st.markdown(answer)