Rom89823974978 committed on
Commit
69e8901
1 Parent(s): fa1006f
Files changed (5)
  1. DataExploration.ipynb +57 -4
  2. backend/main.py +15 -12
  3. data_enhancement.py +442 -0
  4. predictive_modelling.py +360 -0
  5. rag_test.py +0 -223
DataExploration.ipynb CHANGED
@@ -599,19 +599,72 @@
599
  },
600
  {
601
  "cell_type": "code",
602
- "execution_count": 1,
603
  "metadata": {},
604
  "outputs": [],
605
  "source": [
606
  "import polars as pl\n",
607
  "import pathlib\n",
608
- "ROOT = pathlib.Path(r\"C:\\Users\\Romain\\OneDrive - KU Leuven\\Masters\\MBIS\\Year 2\\Semester 2\\Modern Data Analytics\\CORDIS\")\n",
609
- "OUTDIR = ROOT / \"combined\"\n",
610
  "\n",
611
- "consolidated = pl.read_parquet(OUTDIR / \"consolidated.parquet\")\n",
612
  "consolidated_clean = pl.read_parquet(OUTDIR / \"consolidated_clean.parquet\")\n"
613
  ]
614
  },
615
  {
616
  "cell_type": "markdown",
617
  "metadata": {},
 
599
  },
600
  {
601
  "cell_type": "code",
602
+ "execution_count": 2,
603
  "metadata": {},
604
  "outputs": [],
605
  "source": [
606
  "import polars as pl\n",
607
  "import pathlib\n",
608
+ "ROOT = pathlib.Path(r\"C:\\Users\\Romain\\OneDrive - KU Leuven\\MDA\\backend\\data\")\n",
609
+ "OUTDIR = ROOT #/ \"combined\"\n",
610
  "\n",
611
+ "#consolidated = pl.read_parquet(OUTDIR / \"consolidated.parquet\")\n",
612
  "consolidated_clean = pl.read_parquet(OUTDIR / \"consolidated_clean.parquet\")\n"
613
  ]
614
  },
615
+ {
616
+ "cell_type": "code",
617
+ "execution_count": null,
618
+ "metadata": {},
619
+ "outputs": [
620
+ {
621
+ "data": {
622
+ "text/html": [
623
+ "<div><style>\n",
624
+ ".dataframe > thead > tr,\n",
625
+ ".dataframe > tbody > tr {\n",
626
+ " text-align: right;\n",
627
+ " white-space: pre-wrap;\n",
628
+ "}\n",
629
+ "</style>\n",
630
+ "<small>shape: (5, 68)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>id</th><th>acronym</th><th>status</th><th>title</th><th>startDate</th><th>endDate</th><th>totalCost</th><th>ecMaxContribution</th><th>legalBasis</th><th>topics</th><th>ecSignatureDate</th><th>frameworkProgramme</th><th>masterCall</th><th>subCall</th><th>fundingScheme</th><th>nature</th><th>objective</th><th>contentUpdateDate</th><th>rcn</th><th>grantDoi</th><th>programmeFolder</th><th>list_deliverableType</th><th>list_url</th><th>list_contentUpdateDate</th><th>list_authors</th><th>list_title</th><th>list_doi</th><th>list_journalTitle</th><th>list_isPublishedAs</th><th>list_publishedYear</th><th>list_contentUpdateDate_publi</th><th>list_title_report</th><th>list_attachment</th><th>list_contentUpdateDate_report</th><th>list_organisationID</th><th>list_country</th><th>list_name</th><th>list_SME</th><th>list_city</th><th>list_geolocation</th><th>list_organizationURL</th><th>list_role</th><th>list_ecContribution</th><th>list_netEcContribution</th><th>list_totalCost</th><th>list_endOfParticipation</th><th>list_activityType</th><th>list_contentUpdateDate_org</th><th>list_physUrl</th><th>list_availableLanguages</th><th>list_status</th><th>list_archivedDate</th><th>list_type</th><th>list_source</th><th>list_represents</th><th>list_legalBasis</th><th>list_title_legal</th><th>list_uniqueProgrammePart</th><th>list_topic</th><th>list_title_topic</th><th>list_euroSciVocTitle</th><th>list_euroSciVocPath</th><th>list_description</th><th>netEcContribution</th><th>startYear</th><th>endYear</th><th>durationDays</th><th>ecRatio</th></tr><tr><td>str</td><td>str</td><td>str</td><td>str</td><td>date</td><td>date</td><td>f64</td><td>f64</td><td>str</td><td>str</td><td>date</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>datetime[μs]</td><td>i64</td><td>str</td><td>str</td><td>list[str]</td><td>list[str]</td><td>list[datetime[μs]]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[datetime[μs]]</td><td>list[str]</td><td>list[str]</td><td>list[datetime[μs]]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[f64]</td><td>list[str]</td><td>list[str]</td><td>list[datetime[μs]]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>list[str]</td><td>f64</td><td>i32</td><td>i32</td><td>i64</td><td>f64</td></tr></thead><tbody><tr><td>&quot;624794&quot;</td><td>&quot;COMPACTABILITY&quot;</td><td>&quot;CLOSED&quot;</td><td>&quot;Contribution of Compact Neighb…</td><td>2014-12-01</td><td>2016-11-30</td><td>309235.2</td><td>309235.2</td><td>&quot;FP7-PEOPLE&quot;</td><td>&quot;FP7-PEOPLE-2013-IEF&quot;</td><td>null</td><td>&quot;FP7&quot;</td><td>null</td><td>&quot;FP7-PEOPLE-2013-IEF&quot;</td><td>&quot;MC-IEF&quot;</td><td>null</td><td>&quot;This research investigates how…</td><td>2017-04-10 11:25:29</td><td>187874</td><td>null</td><td>&quot;H2013&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>[&quot;Final Report Summary - COMPACTABILITY (Contribution of Compact Neighbourhoods to Social 
Sustainability)&quot;]</td><td>[&quot;/docs/results/624/624794/final1-table-1.jpg&quot;]</td><td>[2017-03-07 17:25:15]</td><td>[&quot;999446873&quot;]</td><td>[&quot;UK&quot;]</td><td>[&quot;OXFORD BROOKES UNIVERSITY&quot;]</td><td>[null]</td><td>[&quot;Oxford&quot;]</td><td>[&quot;51.7520131,-1.2578498&quot;]</td><td>[&quot;http://www.brookes.ac.uk&quot;]</td><td>[&quot;coordinator&quot;]</td><td>[&quot;309235.2&quot;]</td><td>[null]</td><td>[null]</td><td>[&quot;false&quot;]</td><td>[&quot;HES&quot;]</td><td>[2017-04-10 11:25:29]</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>[&quot;FP7-PEOPLE&quot;]</td><td>[&quot;Specific programme &quot;People&quot; implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)&quot;]</td><td>[null]</td><td>[&quot;FP7-PEOPLE-2013-IEF&quot;]</td><td>[&quot;Marie-Curie Action: Intra-European fellowships for career development&quot;]</td><td>null</td><td>null</td><td>null</td><td>0.0</td><td>2014</td><td>2016</td><td>730</td><td>0.0</td></tr><tr><td>&quot;276810&quot;</td><td>&quot;ARCHOSL&quot;</td><td>&quot;CLOSED&quot;</td><td>&quot;Archives of Early Human Occupa…</td><td>2011-03-01</td><td>2014-02-28</td><td>75000.0</td><td>75000.0</td><td>&quot;FP7-PEOPLE&quot;</td><td>&quot;FP7-PEOPLE-2009-RG&quot;</td><td>null</td><td>&quot;FP7&quot;</td><td>null</td><td>&quot;FP7-PEOPLE-2010-RG&quot;</td><td>&quot;MC-IRG&quot;</td><td>null</td><td>&quot;A number of important archaeol…</td><td>2019-08-02 13:24:51</td><td>98178</td><td>null</td><td>&quot;H2013&quot;</td><td>null</td><td>null</td><td>null</td><td>[&quot;Arnold, L.J., Demuro, M., Parés, J.M., Arsuaga, J.L., Aranburu, A.,&quot;, &quot;Lee J. Arnold , Martina Demuro , Marta Navazo , Alfonso Benito-Calvo , Alfredo Pérez-González&quot;, … &quot;F. Gutiérrez , B. Valero-Garcés , G. Desir , P. González-Sampériz , M. Gutiérrez , R. Linares , M. Zarroca , A. Moreno , J. Guerrero , C. 
Roqué&quot;]</td><td>[&quot;Luminescence dating and palaeomagnetic age constraint on hominins from Sima de los Huesos, Atapuerca, Spain&quot;, &quot;OSL dating of the Middle Palaeolithic Hotel California site, Sierra de Atapuerca, north-central Spain&quot;, … &quot;Late Holocene evolution of playa lakes in the central Ebro depression based on geophysical surveys and morpho-stratigraphic analysis of lacustrine terraces&quot;]</td><td>[&quot;http://dx.doi.org/10.1016/j.jhevol.2013.12.001&quot;, &quot;10.1111/j.1502-3885.2012.00262.x&quot;, … &quot;http://dx.doi.org/10.1016/j.geomorph.2012.02.013&quot;]</td><td>[&quot;Journal of Human Evolution&quot;, &quot;Boreas&quot;, … &quot;Geomorphology&quot;]</td><td>[&quot;PEER REVIEWED ARTICLE&quot;, &quot;PEER REVIEWED ARTICLE&quot;, … &quot;PEER REVIEWED ARTICLE&quot;]</td><td>[null, null, … null]</td><td>[null, null, … null]</td><td>[&quot;Final Report Summary - ARCHOSL (Archives of Early Human Occupation in Western Europe: OSL Chronologies beyond the Middle Pleistocene in the Iberian Peninsula)&quot;]</td><td>[null]</td><td>[2014-11-07 13:26:06]</td><td>[&quot;986579241&quot;]</td><td>[&quot;ES&quot;]</td><td>[&quot;CENTRO NACIONAL DE INVESTIGACION SOBRE LA EVOLUCION HUMANA&quot;]</td><td>[null]</td><td>[&quot;Burgos&quot;]</td><td>[&quot;42.3396185,-3.6967044&quot;]</td><td>[&quot;http://www.cenieh.es&quot;]</td><td>[&quot;coordinator&quot;]</td><td>[&quot;75000&quot;]</td><td>[null]</td><td>[null]</td><td>[&quot;false&quot;]</td><td>[&quot;REC&quot;]</td><td>[2019-08-02 13:24:51]</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>[&quot;FP7-PEOPLE&quot;]</td><td>[&quot;Specific programme &quot;People&quot; implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)&quot;]</td><td>[null]</td><td>[&quot;FP7-PEOPLE-2009-RG&quot;]</td><td>[&quot;Marie Curie Action: Reintegration Grants&quot;]</td><td>[&quot;ethnoarchaeology&quot;, &quot;physical anthropology&quot;]</td><td>[&quot;/humanities/history and archaeology/archaeology/ethnoarchaeology&quot;, &quot;/social sciences/sociology/anthropology/physical anthropology&quot;]</td><td>[null, null]</td><td>0.0</td><td>2011</td><td>2014</td><td>1095</td><td>0.0</td></tr><tr><td>&quot;622478&quot;</td><td>&quot;DETforDRF 2.0&quot;</td><td>&quot;CLOSED&quot;</td><td>&quot;Design and Expansion Turbine f…</td><td>null</td><td>null</td><td>161968.8</td><td>161968.8</td><td>&quot;FP7-PEOPLE&quot;</td><td>&quot;FP7-PEOPLE-2013-IEF&quot;</td><td>null</td><td>&quot;FP7&quot;</td><td>null</td><td>&quot;FP7-PEOPLE-2013-IEF&quot;</td><td>&quot;MC-IEF&quot;</td><td>null</td><td>&quot;This proposal for a Marie Curi…</td><td>2016-03-31 21:10:31</td><td>187686</td><td>null</td><td>&quot;H2013&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>[&quot;953573536&quot;]</td><td>[&quot;DE&quot;]</td><td>[&quot;BSH HAUSGERATE GMBH&quot;]</td><td>[null]</td><td>[&quot;Munchen&quot;]</td><td>[&quot;48.0887063,11.6433468&quot;]</td><td>[&quot;http://www.bsh-group.com&quot;]</td><td>[&quot;coordinator&quot;]</td><td>[&quot;161968.8&quot;]</td><td>[null]</td><td>[null]</td><td>[&quot;false&quot;]</td><td>[&quot;PRC&quot;]</td><td>[2016-03-31 
21:10:31]</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>[&quot;FP7-PEOPLE&quot;]</td><td>[&quot;Specific programme &quot;People&quot; implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)&quot;]</td><td>[null]</td><td>[&quot;FP7-PEOPLE-2013-IEF&quot;]</td><td>[&quot;Marie-Curie Action: Intra-European fellowships for career development&quot;]</td><td>[&quot;fluid dynamics&quot;]</td><td>[&quot;/natural sciences/physical sciences/classical mechanics/fluid mechanics/fluid dynamics&quot;]</td><td>[null]</td><td>0.0</td><td>null</td><td>null</td><td>null</td><td>0.0</td></tr><tr><td>&quot;615785&quot;</td><td>&quot;EMERGING SUBJECTS&quot;</td><td>&quot;CLOSED&quot;</td><td>&quot;Emerging Subjects of the New E…</td><td>2014-09-01</td><td>2019-06-30</td><td>1.658373e6</td><td>1.658373e6</td><td>&quot;FP7-IDEAS-ERC&quot;</td><td>&quot;ERC-CG-2013-SH2&quot;</td><td>null</td><td>&quot;FP7&quot;</td><td>null</td><td>&quot;ERC-2013-CoG&quot;</td><td>&quot;ERC-CG&quot;</td><td>null</td><td>&quot;This project examines how pred…</td><td>2023-04-05 11:40:06</td><td>188675</td><td>null</td><td>&quot;H2013&quot;</td><td>null</td><td>null</td><td>null</td><td>[&quot;Rebekah Plueckhahn&quot;, &quot;Dulam, Bumochir&quot;, … &quot;•Empson, R. A.&quot;]</td><td>[&quot;Tragic Spirits: Shamanism, Memory, and Gender in Contemporary Mongolia by Manduhai Buyandelger.&quot;, &quot;The Afterlife of Nomadism: Pastoralism, environmentalism, civilization and identity in Mongolia and China&quot;, … &quot;A Space That Will Never Be Filled Sharp Communication and the Simultaneity of Opposites.&quot;]</td><td>[&quot;10.1111/aman.12304&quot;, null, … null]</td><td>[&quot;American Anthropologist&quot;, &quot;Pastoralist Livelihoods in Asian Drylands: Environment, Governance and Risk&quot;, … &quot;Current Anthropology&quot;]</td><td>[&quot;PEER_REVIEWED_ARTICLE&quot;, &quot;ARTICLE&quot;, … &quot;ARTICLE&quot;]</td><td>[null, null, … null]</td><td>[null, null, … null]</td><td>[&quot;Final Report Summary - EMERGING SUBJECTS (Emerging Subjects of the New Economy: Tracing Economic Growth in Mongolia)&quot;]</td><td>[null]</td><td>[2018-01-15 17:25:25]</td><td>[&quot;888898146&quot;]</td><td>[null]</td><td>[&quot;UNIVERSITY COLLEGE LONDON&quot;]</td><td>[null]</td><td>[&quot;LONDON&quot;]</td><td>[&quot;51.5236746,-0.1339608&quot;]</td><td>[&quot;http://www.ucl.ac.uk&quot;]</td><td>[&quot;coordinator&quot;]</td><td>[&quot;1658373&quot;]</td><td>[null]</td><td>[null]</td><td>[&quot;false&quot;]</td><td>[&quot;HES&quot;]</td><td>[2023-04-05 11:40:06]</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>[&quot;FP7-IDEAS-ERC&quot;]</td><td>[&quot;Specific programme: &quot;Ideas&quot; implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)&quot;]</td><td>[null]</td><td>[&quot;ERC-CG-2013-SH2&quot;]</td><td>[&quot;ERC Consolidator Grant - Institutions Values Beliefs and behaviour&quot;]</td><td>[&quot;anthropology&quot;]</td><td>[&quot;/social sciences/sociology/anthropology&quot;]</td><td>[null]</td><td>0.0</td><td>2014</td><td>2019</td><td>1763</td><td>0.0</td></tr><tr><td>&quot;237010&quot;</td><td>&quot;DEER PALAEOBIOLOGY&quot;</td><td>&quot;CLOSED&quot;</td><td>&quot;Palaeobiological inference 
thr…</td><td>2009-04-09</td><td>2011-01-08</td><td>173416.47</td><td>173416.47</td><td>&quot;FP7-PEOPLE&quot;</td><td>&quot;FP7-PEOPLE-IEF-2008&quot;</td><td>null</td><td>&quot;FP7&quot;</td><td>null</td><td>&quot;FP7-PEOPLE-IEF-2008&quot;</td><td>&quot;MC-IEF&quot;</td><td>null</td><td>&quot;The present research aims to r…</td><td>2019-07-16 19:18:25</td><td>90424</td><td>null</td><td>&quot;H2013&quot;</td><td>null</td><td>null</td><td>null</td><td>[&quot;Lister, A.M., Breda, M. and others&quot;, &quot;Breda, M., Lister, A.M. &amp; others&quot;]</td><td>[&quot;Metric analysis of ungulate mammals in the early Middle Pleistocene of Britain, in relation to taxonomy and biostratigraphy. II. Cervidae, Equidae and Suidae.&quot;, &quot;Metric analysis of ungulate mammals in the early Middle Pleistocene of Britain, in relation to taxonomy and biostratigraphy. I: Rhinocerotidae and Bovidae.&quot;]</td><td>[null, null]</td><td>[&quot;Quaternary International&quot;, &quot;Quaternary International&quot;]</td><td>[&quot;PEER REVIEWED ARTICLE&quot;, &quot;PEER REVIEWED ARTICLE&quot;]</td><td>[null, null]</td><td>[null, null]</td><td>[&quot;Final Report Summary - DEER PALAEOBIOLOGY (Palaeobiological inference through phylogenetic analysis of Pleistocene deer)&quot;]</td><td>[null]</td><td>[2013-07-05 00:02:53]</td><td>[&quot;999642037&quot;]</td><td>[&quot;UK&quot;]</td><td>[&quot;NATURAL HISTORY MUSEUM&quot;]</td><td>[null]</td><td>[&quot;London&quot;]</td><td>[&quot;51.494882,-0.1847716&quot;]</td><td>[&quot;http://www.nhm.ac.uk/&quot;]</td><td>[&quot;coordinator&quot;]</td><td>[&quot;173416.47&quot;]</td><td>[null]</td><td>[null]</td><td>[&quot;false&quot;]</td><td>[&quot;PUB&quot;]</td><td>[2019-07-16 19:18:25]</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>[&quot;FP7-PEOPLE&quot;]</td><td>[&quot;Specific programme &quot;People&quot; implementing the Seventh Framework Programme of the European Community for research, technological development and demonstration activities (2007 to 2013)&quot;]</td><td>[null]</td><td>[&quot;FP7-PEOPLE-IEF-2008&quot;]</td><td>[&quot;Marie Curie Action: Intra-European Fellowships for Career Development&quot;]</td><td>[&quot;comparative morphology&quot;]</td><td>[&quot;/natural sciences/biological sciences/biological morphology/comparative morphology&quot;]</td><td>[null]</td><td>0.0</td><td>2009</td><td>2011</td><td>639</td><td>0.0</td></tr></tbody></table></div>"
631
+ ],
632
+ "text/plain": [
633
+ "shape: (5, 68)\n",
634
+ "┌────────┬──────────────┬────────┬──────────────┬───┬───────────┬─────────┬──────────────┬─────────┐\n",
635
+ "│ id ┆ acronym ┆ status ┆ title ┆ … ┆ startYear ┆ endYear ┆ durationDays ┆ ecRatio │\n",
636
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
637
+ "│ str ┆ str ┆ str ┆ str ┆ ┆ i32 ┆ i32 ┆ i64 ┆ f64 │\n",
638
+ "╞════════╪══════════════╪════════╪══════════════╪═══╪═══════════╪═════════╪══════════════╪═════════╡\n",
639
+ "│ 624794 ┆ COMPACTABILI ┆ CLOSED ┆ Contribution ┆ … ┆ 2014 ┆ 2016 ┆ 730 ┆ 0.0 │\n",
640
+ "│ ┆ TY ┆ ┆ of Compact ┆ ┆ ┆ ┆ ┆ │\n",
641
+ "│ ┆ ┆ ┆ Neighb… ┆ ┆ ┆ ┆ ┆ │\n",
642
+ "│ 276810 ┆ ARCHOSL ┆ CLOSED ┆ Archives of ┆ … ┆ 2011 ┆ 2014 ┆ 1095 ┆ 0.0 │\n",
643
+ "│ ┆ ┆ ┆ Early Human ┆ ┆ ┆ ┆ ┆ │\n",
644
+ "│ ┆ ┆ ┆ Occupa… ┆ ┆ ┆ ┆ ┆ │\n",
645
+ "│ 622478 ┆ DETforDRF ┆ CLOSED ┆ Design and ┆ … ┆ null ┆ null ┆ null ┆ 0.0 │\n",
646
+ "│ ┆ 2.0 ┆ ┆ Expansion ┆ ┆ ┆ ┆ ┆ │\n",
647
+ "│ ┆ ┆ ┆ Turbine f… ┆ ┆ ┆ ┆ ┆ │\n",
648
+ "│ 615785 ┆ EMERGING ┆ CLOSED ┆ Emerging ┆ … ┆ 2014 ┆ 2019 ┆ 1763 ┆ 0.0 │\n",
649
+ "│ ┆ SUBJECTS ┆ ┆ Subjects of ┆ ┆ ┆ ┆ ┆ │\n",
650
+ "│ ┆ ┆ ┆ the New E… ┆ ┆ ┆ ┆ ┆ │\n",
651
+ "│ 237010 ┆ DEER PALAEOB ┆ CLOSED ┆ Palaeobiolog ┆ … ┆ 2009 ┆ 2011 ┆ 639 ┆ 0.0 │\n",
652
+ "│ ┆ IOLOGY ┆ ┆ ical ┆ ┆ ┆ ┆ ┆ │\n",
653
+ "│ ┆ ┆ ┆ inference ┆ ┆ ┆ ┆ ┆ │\n",
654
+ "│ ┆ ┆ ┆ thr… ┆ ┆ ┆ ┆ ┆ │\n",
655
+ "└────────┴──────────────┴────────┴──────────────┴───┴───────────┴─────────┴──────────────┴─────────┘"
656
+ ]
657
+ },
658
+ "execution_count": 3,
659
+ "metadata": {},
660
+ "output_type": "execute_result"
661
+ }
662
+ ],
663
+ "source": [
664
+ "consolidated_clean.head()\n",
665
+ "#ecMaxContribution, endDate, status, legalBasis, frameworkProgramme, fundingScheme, list_title_report, list_name, list_role, list_city, list_country, list_ecContribution, list_activityType, durationDays"
666
+ ]
667
+ },
668
  {
669
  "cell_type": "markdown",
670
  "metadata": {},
backend/main.py CHANGED
@@ -9,10 +9,12 @@ from pydantic import BaseModel
9
  #except:
10
  # from .rag import get_rag_chain, RAGRequest, RAGResponse
11
  from contextlib import asynccontextmanager
12
-
13
  import polars as pl
14
  import gcsfs
15
 
 
 
16
  @asynccontextmanager
17
  async def lifespan(app: FastAPI):
18
  bucket = "mda_eu_project"
@@ -121,29 +123,30 @@ def get_stats(request: Request):
121
  @app.get("/api/project/{project_id}/organizations")
122
  def get_project_organizations(project_id: str):
123
  df = app.state.df
 
124
  sel = df.filter(pl.col("id") == project_id)
125
  if sel.is_empty():
126
  raise HTTPException(status_code=404, detail="Project not found")
127
 
128
  orgs_df = (
129
- sel.select([
130
- pl.explode("list_name").alias("name"),
131
- pl.explode("list_country").alias("country"),
132
- pl.explode("list_geolocation").alias("geoloc"),
 
133
  ])
134
  .with_columns([
135
- pl.col("geoloc")
136
- .str.split_exact(",", 1)
137
- .alias("latlon")
138
  ])
139
  .with_columns([
140
- pl.col("latlon").list.get(0).cast(float).alias("latitude"),
141
- pl.col("latlon").list.get(1).cast(float).alias("longitude")
142
  ])
143
  .filter(pl.col("name").is_not_null())
144
- .select(["name","country","latitude","longitude"])
145
  )
146
- print(orgs_df)
147
  return orgs_df.to_dicts()
148
 
149
  """def rag_chain_depender():
 
9
  #except:
10
  # from .rag import get_rag_chain, RAGRequest, RAGResponse
11
  from contextlib import asynccontextmanager
12
+ import os
13
  import polars as pl
14
  import gcsfs
15
 
16
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Users\Romain\OneDrive - KU Leuven\focal-pager-460414-e9-45369b738be0.json"
17
+
18
  @asynccontextmanager
19
  async def lifespan(app: FastAPI):
20
  bucket = "mda_eu_project"
 
123
  @app.get("/api/project/{project_id}/organizations")
124
  def get_project_organizations(project_id: str):
125
  df = app.state.df
126
+
127
  sel = df.filter(pl.col("id") == project_id)
128
  if sel.is_empty():
129
  raise HTTPException(status_code=404, detail="Project not found")
130
 
131
  orgs_df = (
132
+ sel
133
+ .select([
134
+ pl.col("list_name").explode().alias("name"),
135
+ pl.col("list_country").explode().alias("country"),
136
+ pl.col("list_geolocation").explode().alias("geoloc"),
137
  ])
138
  .with_columns([
139
+ # now this is a List(Utf8)
140
+ pl.col("geoloc").str.split(",").alias("latlon"),
 
141
  ])
142
  .with_columns([
143
+ pl.col("latlon").list.get(0).cast(pl.Float64).alias("latitude"),
144
+ pl.col("latlon").list.get(1).cast(pl.Float64).alias("longitude"),
145
  ])
146
  .filter(pl.col("name").is_not_null())
147
+ .select(["name", "country", "latitude", "longitude"])
148
  )
149
+
150
  return orgs_df.to_dicts()
151
 
152
  """def rag_chain_depender():
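For reference, the reworked get_project_organizations endpoint boils down to an explode-then-split pattern on the per-project list columns. A minimal standalone sketch of that pattern on a made-up one-row frame (the organisation names, countries and coordinates below are illustrative, not taken from the dataset):

# Sketch of the explode/split logic used in get_project_organizations, on toy data.
import polars as pl

sel = pl.DataFrame({
    "list_name": [["ORG A", "ORG B"]],
    "list_country": [["BE", "FR"]],
    "list_geolocation": [["50.8798,4.7005", "48.8566,2.3522"]],
})

orgs_df = (
    sel
    .select([
        pl.col("list_name").explode().alias("name"),
        pl.col("list_country").explode().alias("country"),
        pl.col("list_geolocation").explode().alias("geoloc"),
    ])
    .with_columns(pl.col("geoloc").str.split(",").alias("latlon"))  # List(Utf8)
    .with_columns([
        pl.col("latlon").list.get(0).cast(pl.Float64).alias("latitude"),
        pl.col("latlon").list.get(1).cast(pl.Float64).alias("longitude"),
    ])
    .filter(pl.col("name").is_not_null())
    .select(["name", "country", "latitude", "longitude"])
)
print(orgs_df.to_dicts())
# [{'name': 'ORG A', 'country': 'BE', 'latitude': 50.8798, 'longitude': 4.7005},
#  {'name': 'ORG B', 'country': 'FR', 'latitude': 48.8566, 'longitude': 2.3522}]

Splitting the "lat,lon" string with str.split and casting the two list elements keeps the whole transformation inside polars expressions, which is what the endpoint now does before returning to_dicts().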
data_enhancement.py ADDED
@@ -0,0 +1,442 @@
1
+ from __future__ import annotations
2
+ import csv
3
+ import re
4
+ import pathlib
5
+ import polars as pl
6
+
7
+ ROOT = pathlib.Path(r"C:\Users\Romain\OneDrive - KU Leuven\Masters\MBIS\Year 2\Semester 2\Modern Data Analytics\CORDIS")
8
+ DATASETS = [
9
+ "project",
10
+ "projectDeliverables",
11
+ "projectPublications",
12
+ "reportSummaries",
13
+ "organization",
14
+ "euroSciVoc",
15
+ "topics",
16
+ "webItem",
17
+ "webLink",
18
+ "legalBasis",
19
+ ]
20
+ OUTDIR = ROOT / "combined"
21
+ OUTDIR.mkdir(exist_ok=True)
22
+
23
+ ###############################################################################
24
+ # 2. Generic cleaner – parameterised version of the row-repair loop
25
+ ###############################################################################
26
+ _PROJECT_ID_RE = re.compile(r"^(?:19|20)\d{2}")
27
+ _GENERIC_NUM_RE = re.compile(r"\d{4}")
28
+
29
+ import csv, pathlib, polars as pl, re
30
+
31
+ import csv, re, pathlib
32
+ import polars as pl # >=0.20
33
+
34
+ import csv, pathlib, re
35
+ import polars as pl # ≥ 0.20
36
+
37
+
38
+ def _clean_one_file(csv_path: pathlib.Path,
39
+ number_regex: re.Pattern[str], dataset: str) -> pl.DataFrame:
40
+ """
41
+ Clean a CORDIS CSV whose long *objective* field sometimes explodes into
42
+ extra columns because of stray quotes / semicolons.
43
+
44
+ Strategy
45
+ --------
46
+ * A well-formed row has 21 semicolon-separated columns.
47
+ * If we get more than 21 columns we treat columns 16 … -4 as belonging
48
+ to *objective* and stitch them back together with a semicolon.
49
+ * The last three columns are contentUpdateDate | rcn | grantDoi.
50
+ """
51
+ # ---------- constants --------------------------------------------------
52
+ if dataset=="project":
53
+ EXPECTED_COLS = 20 # final width
54
+ TITLE_COL = 3 # 0-based index of *title*
55
+ DATE1_COL = 4 # 0-based index of startDate
56
+ DATE2_COL = 5 # 0-based index of endDate
57
+ OBJECTIVE_COL = 16 # 0-based index of objective
58
+ TRAILING_KEEP = 3 # last three fixed columns
59
+ elif dataset=="organization":
60
+ EXPECTED_COLS = 25 # final width
61
+ TITLE_COL = 3 # 0-based index of *title*
62
+ DATE1_COL = 4 # 0-based index of startDate
63
+ DATE2_COL = 5 # 0-based index of endDate
64
+ OBJECTIVE_COL = 4 # 0-based index of objective
65
+ TRAILING_KEEP = 20 # last twenty fixed columns
66
+ else:
67
+ EXPECTED_COLS = 20 # final width
68
+ TITLE_COL = 3 # 0-based index of *title*
69
+ DATE1_COL = 4 # 0-based index of startDate
70
+ DATE2_COL = 5 # 0-based index of endDate
71
+ OBJECTIVE_COL = 16 # 0-based index of objective
72
+ TRAILING_KEEP = 3 # last three fixed columns
73
+
74
+
75
+
76
+ date_rx = re.compile(r"\d{4}-\d{2}-\d{2}$")
77
+ is_date = lambda s: (s == "") or bool(date_rx.match(s))
78
+
79
+ tmp_clean = csv_path.with_suffix(".cleaned.csv")
80
+
81
+ with csv_path.open(encoding="utf-8", newline="") as fin, \
82
+ tmp_clean.open("w", encoding="utf-8", newline="") as fout:
83
+
84
+ writer = csv.writer(
85
+ fout,
86
+ delimiter="|",
87
+ quotechar='"',
88
+ quoting=csv.QUOTE_MINIMAL,
89
+ lineterminator="\n",
90
+ )
91
+
92
+ # ---------- iterate raw lines -------------------------------------
93
+ for raw in fin:
94
+ #print(raw)
95
+ raw = raw.rstrip("\n")
96
+ #print(raw)
97
+ cells = raw.split(";") # blind split
98
+
99
+ # ---- 1️⃣ repair *title* if dates are not where they belong --
100
+ if (len(cells) > EXPECTED_COLS) and (not is_date(cells[DATE1_COL]) or not is_date(cells[DATE2_COL])) and dataset=="project":
101
+ # look for the first position where *two successive* cells
102
+ # are both valid dates / nulls
103
+ i = DATE1_COL
104
+ while i + 1 < len(cells):
105
+ if is_date(cells[i]) and is_date(cells[i + 1]):
106
+ break
107
+ i += 1
108
+ else:
109
+ # cannot find a valid date pair → give up on this line
110
+ continue
111
+
112
+ head = cells[:TITLE_COL] # 0 … 2
113
+ title = ";".join(cells[TITLE_COL:i]) # glue spill-over
114
+ cells = head + [title] + cells[i:] # rebuild the row
115
+ # ---- 2️⃣ repair *objective* overflow ------------------------
116
+ if len(cells) > EXPECTED_COLS and (dataset=="project" or dataset=="organization"):
117
+ head = cells[:OBJECTIVE_COL]
118
+ tail = cells[-TRAILING_KEEP:]
119
+ obj = ";".join(cells[OBJECTIVE_COL:-TRAILING_KEEP])
120
+ cells = head + [obj] + tail
121
+ #print("here 2")
122
+
123
+ # ---- 3️⃣ pad short rows, skip malformed ---------------------
124
+ if len(cells) < EXPECTED_COLS and (dataset=="project" or dataset=="organization"):
125
+ cells.extend([""] * (EXPECTED_COLS - len(cells)))
126
+ #print("here again")
127
+
128
+ if len(cells) != EXPECTED_COLS and (dataset=="project" or dataset=="organization"): # still wrong → skip
129
+ #print(cells)
130
+ continue
131
+
132
+ # ---- 4️⃣ cell-level clean-ups -------------------------------
133
+ cleaned: list[str] = []
134
+ for cell in cells:
135
+
136
+ if cell in ('""', ""):
137
+ cell = ""
138
+ else:
139
+ cell = (cell.replace("\t", " ")
140
+ .replace('"""', '"')
141
+ .strip())
142
+ if number_regex.fullmatch(cell):
143
+ cell = cell.lstrip("0") or "0"
144
+ cleaned.append(cell.strip('"'))
145
+ cleaned[-1]=cleaned[-1].replace('"','').replace(',','')
146
+ cleaned[0]=cleaned[0].replace('"','')
147
+ writer.writerow(cleaned)
148
+
149
+ # ---------- read into Polars (all Utf8) -------------------------------
150
+ return pl.read_csv(
151
+ tmp_clean,
152
+ separator="|",
153
+ quote_char='"',
154
+ has_header=True,
155
+ infer_schema_length=0,
156
+ null_values=[""],
157
+ truncate_ragged_lines=True,
158
+ )
159
+
160
+
161
+ def combine_all_programmes() -> None:
162
+ from pathlib import Path
163
+ for dataset in DATASETS:
164
+ combined: list[pl.DataFrame] = []
165
+
166
+ for i,programme_dir in enumerate(ROOT.iterdir()):
167
+ if not programme_dir.is_dir():
168
+ continue
169
+ csv_file = programme_dir / f"{dataset}.csv"
170
+ if not csv_file.exists():
171
+ continue
172
+
173
+ regex = _PROJECT_ID_RE if dataset == "project" else _GENERIC_NUM_RE
174
+ df = _clean_one_file(csv_file, regex, dataset)
175
+ print(programme_dir)
176
+ # ---------- type coercions matching your original code ----------
177
+ if dataset == "project":
178
+ df = (
179
+ df
180
+ .with_columns([
181
+ pl.col("id"),#.cast(pl.Int64),
182
+ pl.col("acronym").cast(pl.Utf8, strict=False).str.strip_chars('"'),
183
+ pl.col("status").cast(pl.Utf8, strict=False).str.strip_chars('"'),
184
+ pl.col("title").cast(pl.Utf8, strict=False).str.strip_chars('"'),
185
+ pl.col("legalBasis").cast(pl.Utf8, strict=False).str.strip_chars('"'),
186
+ pl.col("topics").cast(pl.Utf8, strict=False).str.strip_chars('"'),
187
+ pl.col("frameworkProgramme").cast(pl.Utf8, strict=False).str.strip_chars('"'),
188
+ pl.col("masterCall").cast(pl.Utf8, strict=False).str.strip_chars('"'),
189
+ pl.col("subCall").cast(pl.Utf8, strict=False).str.strip_chars('"'),
190
+ pl.col("fundingScheme").cast(pl.Utf8, strict=False).str.strip_chars('"'),
191
+ pl.col("nature").cast(pl.Utf8, strict=False).str.strip_chars('"'),
192
+ pl.col("objective").cast(pl.Utf8, strict=False).str.strip_chars('"'),
193
+ pl.col("grantDoi").cast(pl.Utf8, strict=False).str.strip_chars('"'),
194
+ pl.col("totalCost").cast(pl.Utf8, strict=False).str.strip_chars('"').str.replace_all('"','').str.replace(",",".").cast(pl.Float64),
195
+ pl.col("ecMaxContribution").cast(pl.Utf8, strict=False).str.strip_chars('"').str.replace_all('"','').str.replace(",",".").cast(pl.Float64),
196
+ pl.col("startDate").cast(pl.Utf8, strict=False).str.strip_chars('"').str.strptime(pl.Date, "%Y-%m-%d", strict=False),
197
+ pl.col("endDate").cast(pl.Utf8, strict=False).str.strip_chars('"').str.strptime(pl.Date, "%Y-%m-%d", strict=False),
198
+ pl.col("ecSignatureDate").cast(pl.Utf8, strict=False).str.strip_chars('"').str.strptime(pl.Date, "%Y-%m-%d", strict=False),
199
+ pl.col("contentUpdateDate").cast(pl.Utf8, strict=False).str.strip_chars('"').str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
200
+ pl.col("rcn").cast(pl.Int64),
201
+ ])
202
+ .with_columns(
203
+ pl.lit(programme_dir.name).alias("programmeFolder") # <-- NEW COLUMN
204
+ )
205
+ )
206
+ elif dataset == "organization":
207
+ df = df.with_columns([
208
+ pl.col("contentUpdateDate").cast(pl.Utf8, strict=False).str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
209
+ pl.col("totalCost").cast(pl.Utf8, strict=False).str.replace(",",".").cast(pl.Float64),
210
+ ])
211
+ elif dataset == "projectDeliverables":
212
+ df = df.with_columns([
213
+ #pl.col("projectID").cast(pl.Int64),
214
+ pl.col("contentUpdateDate").cast(pl.Utf8, strict=False)
215
+ .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
216
+ ])
217
+ elif dataset == "projectPublications":
218
+ if programme_dir==Path(r"C:\Users\Romain\OneDrive - KU Leuven\Masters\MBIS\Year 2\Semester 2\Modern Data Analytics\CORDIS\H2013"):
219
+ rename_map = {
220
+ "RECORD_ID": "id",
221
+ "TITLE": "title",
222
+ "AUTHOR": "authors",
223
+ "DOI": "doi",
224
+ "PROJECT_ID": "projectID",
225
+ "JOURNAL_TITLE": "journalTitle",
226
+ "PAGES": "publishedPages",
227
+ "PUBLICATION_TYPE": "isPublishedAs",
228
+ }
229
+
230
+ df = df.rename(rename_map)
231
+ else:
232
+ df = df.with_columns([
233
+ pl.col("contentUpdateDate").cast(pl.Utf8, strict=False)
234
+ .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
235
+ pl.col("id").cast(pl.Utf8, strict=False)
236
+ .str.extract(r"^(\d+)_", 1)
237
+ #.cast(pl.Int64)
238
+ .alias("projectID"),
239
+ ])
240
+ elif dataset == "reportSummaries":
241
+ df = df.with_columns(
242
+ pl.col("contentUpdateDate").cast(pl.Utf8, strict=False)
243
+ .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
244
+ )
245
+ elif dataset == "organization":
246
+ df = df.with_columns([
247
+ pl.col("contentUpdateDate").cast(pl.Utf8, strict=False)
248
+ .str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S", strict=False),
249
+ pl.col("totalCost").cast(pl.Utf8, strict=False)
250
+ .str.replace(",", ".")
251
+ .cast(pl.Float64),
252
+ ])
253
+ elif dataset == "webItem":
254
+ df = df.with_columns(
255
+ pl.col("uri").cast(pl.Utf8, strict=False)
256
+ .str.extract(r"/files/\d+/(\d+)/", 1)
257
+ .cast(pl.Int64)
258
+ .alias("projectID"),
259
+ )
260
+
261
+ # ---------------------------------------------------------------
262
+ combined.append(df)
263
+
264
+ # --------------------------------------------------------------------
265
+ # Write out per-dataset parquet
266
+ # --------------------------------------------------------------------
267
+ if combined:
268
+ how="vertical_relaxed"
269
+ if dataset=="projectPublications":
270
+ how="diagonal"
271
+ result = pl.concat(combined, how=how)
272
+ parquet_path = OUTDIR / f"{dataset}_all.parquet"
273
+ result.write_parquet(parquet_path)
274
+ print(f"✔ {dataset:15s} → {parquet_path}")
275
+
276
+ import pathlib
277
+ import polars as pl
278
+
279
+ ROOT = pathlib.Path(r"C:\Users\Romain\OneDrive - KU Leuven\Masters\MBIS\Year 2\Semester 2\Modern Data Analytics\CORDIS")
280
+ OUTDIR = ROOT / "combined"
281
+ DATASETS = [
282
+ "project",
283
+ "projectDeliverables",
284
+ "projectPublications",
285
+ "reportSummaries",
286
+ "organization",
287
+ "euroSciVoc",
288
+ "topics",
289
+ "webItem",
290
+ "webLink",
291
+ "legalBasis",
292
+ ]
293
+
294
+ dfs = {}
295
+ for dataset in DATASETS:
296
+ path = OUTDIR / f"{dataset}_all.parquet"
297
+ dfs[dataset] = pl.read_parquet(path)
298
+
299
+ projects = dfs["project"]
300
+
301
+ projects_deliv = (
302
+ dfs["projectDeliverables"]
303
+ .group_by("projectID")
304
+ .agg([
305
+ pl.col("deliverableType").alias("list_deliverableType"),
306
+ pl.col("url") .alias("list_url"),
307
+ pl.col("contentUpdateDate").alias("list_contentUpdateDate"),
308
+ ])
309
+ )
310
+
311
+ projects_publi = (
312
+ dfs["projectPublications"]
313
+ .group_by("projectID")
314
+ .agg([
315
+ pl.col("authors") .alias("list_authors"),
316
+ pl.col("title") .alias("list_title"),
317
+ pl.col("doi") .alias("list_doi"),
318
+ pl.col("journalTitle") .alias("list_journalTitle"),
319
+ pl.col("isPublishedAs") .alias("list_isPublishedAs"),
320
+ pl.col("publishedYear") .alias("list_publishedYear"),
321
+ pl.col("contentUpdateDate").alias("list_contentUpdateDate"),
322
+ ])
323
+ )
324
+
325
+ report = (
326
+ dfs["reportSummaries"]
327
+ .group_by("projectID")
328
+ .agg([
329
+ pl.col("title") .alias("list_title"),
330
+ pl.col("attachment") .alias("list_attachment"),
331
+ pl.col("contentUpdateDate").alias("list_contentUpdateDate"),
332
+ ])
333
+ )
334
+
335
+ org = (
336
+ dfs["organization"]
337
+ .group_by("projectID")
338
+ .agg([
339
+ pl.col("organisationID") .alias("list_organisationID"),
340
+ pl.col("country") .alias("list_country"),
341
+ pl.col("name") .alias("list_name"),
342
+ pl.col("SME") .alias("list_SME"),
343
+ pl.col("city") .alias("list_city"),
344
+ pl.col("geolocation") .alias("list_geolocation"),
345
+ pl.col("organizationURL") .alias("list_organizationURL"),
346
+ pl.col("role") .alias("list_role"),
347
+ pl.col("ecContribution") .alias("list_ecContribution"),
348
+ pl.col("netEcContribution").alias("list_netEcContribution"),
349
+ pl.col("totalCost") .alias("list_totalCost"),
350
+ pl.col("endOfParticipation").alias("list_endOfParticipation"),
351
+ pl.col("activityType") .alias("list_activityType"),
352
+ pl.col("contentUpdateDate").alias("list_contentUpdateDate"),
353
+ ])
354
+ )
355
+
356
+ voc = (
357
+ dfs["euroSciVoc"]
358
+ .group_by("projectID")
359
+ .agg([
360
+ pl.col("euroSciVocTitle") .alias("list_euroSciVocTitle"),
361
+ pl.col("euroSciVocPath") .alias("list_euroSciVocPath"),
362
+ pl.col("euroSciVocDescription").alias("list_description"),
363
+ ])
364
+ )
365
+
366
+ topic = (
367
+ dfs["topics"]
368
+ .group_by("projectID")
369
+ .agg([
370
+ pl.col("topic") .alias("list_topic"),
371
+ pl.col("title") .alias("list_title"),
372
+ ])
373
+ )
374
+
375
+ web_item = dfs["webItem"] # no aggregation
376
+
377
+ web_link = (
378
+ dfs["webLink"]
379
+ .group_by("projectID")
380
+ .agg([
381
+ pl.col("physUrl") .alias("list_physUrl"),
382
+ pl.col("availableLanguages") .alias("list_availableLanguages"),
383
+ pl.col("status") .alias("list_status"),
384
+ pl.col("archivedDate") .alias("list_archivedDate"),
385
+ pl.col("type") .alias("list_type"),
386
+ pl.col("source") .alias("list_source"),
387
+ pl.col("represents") .alias("list_represents"),
388
+ ])
389
+ )
390
+
391
+ legal = (
392
+ dfs["legalBasis"]
393
+ .group_by("projectID")
394
+ .agg([
395
+ pl.col("legalBasis") .alias("list_legalBasis"),
396
+ pl.col("title") .alias("list_title"),
397
+ pl.col("uniqueProgrammePart").alias("list_uniqueProgrammePart"),
398
+ ])
399
+ )
400
+
401
+ consolidated = (
402
+ projects
403
+ .join(projects_deliv, left_on="id", right_on="projectID", suffix="_deliv", how="left")
404
+ .join(projects_publi, left_on="id", right_on="projectID", suffix="_publi", how="left")
405
+ .join(report, left_on="id", right_on="projectID", suffix="_report", how="left")
406
+ .join(org, left_on="id", right_on="projectID", suffix="_org", how="left")
407
+ .join(web_link, left_on="id", right_on="projectID", suffix="_link", how="left")
408
+ .join(legal, left_on="id", right_on="projectID", suffix="_legal", how="left")
409
+ .join(topic, left_on="id", right_on="projectID", suffix="_topic", how="left")
410
+ .join(voc, left_on="id", right_on="projectID", suffix="_voc", how="left")
411
+ )
412
+
413
+ for col in ["startDate", "endDate"]:
414
+ if consolidated[col].dtype == pl.Utf8:
415
+ consolidated = consolidated.with_columns(
416
+ pl.col(col).str.strptime(pl.Date, "%Y-%m-%d").alias(col)
417
+ )
418
+
419
+ consolidated = consolidated.with_columns(
420
+ pl.col("list_netEcContribution").list.eval(pl.element().cast(pl.Float64),parallel=True)
421
+ .list.sum().alias("netEcContribution")
422
+ )
423
+
424
+ consolidated = consolidated.with_columns(
425
+ pl.col("totalCost").cast(pl.Float64),
426
+ pl.col("netEcContribution").cast(pl.Float64)
427
+ )
428
+
429
+ consolidated = consolidated.with_columns([
430
+ pl.col("startDate").dt.year().alias("startYear"),
431
+ pl.col("endDate"). dt.year().alias("endYear"),
432
+ (pl.col("endDate") - pl.col("startDate")).dt.total_days().alias("durationDays"),
433
+ (pl.col("netEcContribution") / pl.col("totalCost")).alias("ecRatio"),
434
+ ])
435
+
436
+ consolidated.write_parquet(OUTDIR / "consolidated.parquet")
437
+
438
+ excluded_frameworks = ["FP1", "FP2", "FP3", "FP4", "FP5", "FP6"]
439
+
440
+ consolidated_clean = (consolidated.filter(~pl.col("frameworkProgramme").is_in(excluded_frameworks)))
441
+
442
+ consolidated_clean.write_parquet(OUTDIR / "consolidated_clean.parquet")
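The row-repair logic in _clean_one_file is easiest to see on a concrete row. Below is a small replay of the objective-overflow repair using the constants of the "project" branch; the cell values are invented and only the column positions matter:

# Toy replay of the objective-overflow repair from _clean_one_file ("project" branch).
EXPECTED_COLS, OBJECTIVE_COL, TRAILING_KEEP = 20, 16, 3

cells = (
    ["123456", "ACRO", "CLOSED", "Some title"]      # id, acronym, status, title
    + [""] * 12                                     # columns 4-15
    + ["objective part 1", "part 2", "part 3"]      # objective spilled over 3 cells
    + ["2020-01-01 00:00:00", "12345", ""]          # contentUpdateDate | rcn | grantDoi
)
assert len(cells) == 22                             # two cells too many

if len(cells) > EXPECTED_COLS:
    head = cells[:OBJECTIVE_COL]                    # everything before objective
    tail = cells[-TRAILING_KEEP:]                   # trailing fixed columns
    obj = ";".join(cells[OBJECTIVE_COL:-TRAILING_KEEP])
    cells = head + [obj] + tail

assert len(cells) == EXPECTED_COLS
print(cells[OBJECTIVE_COL])  # objective part 1;part 2;part 3

The same idea drives the title repair earlier in the loop: scan forward until two consecutive cells look like dates, then glue everything between the title column and that point back into a single title cell.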
predictive_modelling.py ADDED
@@ -0,0 +1,360 @@
1
+ import os
2
+ import json
3
+ import joblib
4
+ import numpy as np
5
+ import pandas as pd
6
+ import shap
7
+ import matplotlib.pyplot as plt
8
+ import scipy.sparse
9
+
10
+ from sklearn.base import BaseEstimator, TransformerMixin
11
+ from sklearn.pipeline import Pipeline as SKPipeline
12
+ from sklearn.compose import ColumnTransformer
13
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
14
+ from sklearn.impute import SimpleImputer
15
+ from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
16
+ from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
17
+ from sklearn.metrics import classification_report, ConfusionMatrixDisplay, f1_score, make_scorer
18
+ from sklearn.decomposition import TruncatedSVD
19
+ from sklearn.calibration import CalibratedClassifierCV
20
+ from sklearn.ensemble import IsolationForest
21
+
22
+ from imblearn.pipeline import Pipeline as ImbPipeline
23
+ from imblearn.over_sampling import ADASYN
24
+
25
+ from sentence_transformers import SentenceTransformer
26
+ from xgboost import XGBClassifier
27
+
28
+ from evidently import Report
29
+ from evidently.presets import DataDriftPreset
30
+
31
+ import optuna
32
+
33
+ # --- Custom Transformers ---
34
+ class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
35
+ def fit(self, X,y=None):
36
+ self.col = X.name
37
+ self.mlb = MultiLabelBinarizer()
38
+ self.mlb.fit(X)
39
+ return self
40
+ def transform(self, X):
41
+ return self.mlb.transform(X)
42
+ def get_feature_names_out(self, input_features=None):
43
+ return [f"{self.col}_{cls}" for cls in self.mlb.classes_]
44
+ def get_params(self, deep=True):
45
+ return {}
46
+ def set_params(self, **params):
47
+ return self
48
+
49
+ class AnomalyScoreTransformer(BaseEstimator, TransformerMixin):
50
+ def __init__(self):
51
+ self.model = IsolationForest(n_estimators=200, contamination=0.1, random_state=42)
52
+
53
+ def fit(self, X, y=None):
54
+ self.model.fit(X)
55
+ return self
56
+
57
+ def transform(self, X):
58
+ scores = -self.model.decision_function(X)
59
+ return np.hstack([X, scores.reshape(-1, 1)])
60
+
61
+ # --- Step 1: Data Preparation ---
62
+ def prepare_data(df, is_train=True, model_dir="model_artifacts"):
63
+ df = df.copy()
64
+
65
+ if is_train:
66
+ df['status'] = df['status'].astype(str).str.upper()
67
+ df = df[df['status'].isin(['CLOSED', 'TERMINATED'])]
68
+ df['label'] = df['status'].map({'CLOSED': 0, 'TERMINATED': 1})
69
+ assert df['label'].notna().all(), "Label column still has NaNs!"
70
+
71
+ multilabel_fields = [
72
+ 'list_country', 'list_activityType', 'list_deliverableType',
73
+ 'list_availableLanguages', 'list_euroSciVocTitle'
74
+ ]
75
+
76
+ def extract_intermediate_levels(paths):
77
+ tokens = []
78
+ if isinstance(paths, list):
79
+ for p in paths:
80
+ parts = p.strip('/').split('/')
81
+ tokens.extend(parts[:-1])
82
+ return list(set(tokens))
83
+ df['euroSciVoc_intermediate'] = df['list_euroSciVocPath'].apply(extract_intermediate_levels)
84
+ multilabel_fields.append('euroSciVoc_intermediate')
85
+
86
+ for col in multilabel_fields:
87
+ df[col] = df[col].apply(lambda x: [] if x is None else (x.tolist() if hasattr(x, 'tolist') else x))
88
+ df[col] = df[col].apply(lambda x: list(x) if not isinstance(x, list) else x)
89
+ df[col] = df[col].apply(lambda x: [item for item in x if item is not None])
90
+ df[col] = df[col].apply(lambda x: [str(item).upper() for item in x])
91
+
92
+
93
+ def split_languages(lang_list):
94
+ if not isinstance(lang_list, list):
95
+ return []
96
+ result = []
97
+ for entry in lang_list:
98
+ if isinstance(entry, str):
99
+ result.extend(entry.split(","))
100
+ return result
101
+
102
+ df["list_availableLanguages"] = df["list_availableLanguages"].apply(split_languages)
103
+
104
+
105
+ for col in ['title', 'objective']:
106
+ df[col] = df[col].fillna("").astype(str)
107
+
108
+ df['n_partners'] = df['list_name'].apply(
109
+ lambda x: len(x.tolist()) if x is not None and hasattr(x, 'tolist') else (len(x) if isinstance(x, list) else 0)
110
+ )
111
+
112
+ df['n_country'] = df['list_country'].apply(
113
+ lambda x: len(x.tolist()) if x is not None and hasattr(x, 'tolist') else (len(x) if isinstance(x, list) else 0)
114
+ )
115
+
116
+ df['n_sme'] = df['list_SME'].apply(
117
+ lambda x: sum(1 for i in (x.tolist() if hasattr(x, 'tolist') else x) if i is True)
118
+ if x is not None and (hasattr(x, 'tolist') or isinstance(x, list)) else 0
119
+ )
120
+
121
+ return df
122
+
123
+ # --- Step 2: Text Embedding ---
124
+ def compute_embeddings(df, text_columns, model_name='sentence-transformers/LaBSE', svd_dim=50):
125
+ model = SentenceTransformer(model_name)
126
+ os.makedirs("/content/drive/MyDrive/model_artifacts", exist_ok=True)
127
+ os.makedirs("/content/drive/MyDrive/embeddings", exist_ok=True)
128
+ for col in text_columns:
129
+ embedding_file = f"/content/drive/MyDrive/embeddings/{col}_embeddings.npy"
130
+ svd_file = f"/content/drive/MyDrive/model_artifacts/{col}_svd.pkl"
131
+ if os.path.exists(embedding_file):
132
+ print(f"Loading saved embeddings for column '{col}'...")
133
+ embeddings = np.load(embedding_file)
134
+ else:
135
+ print(f"Computing embeddings for column '{col}'...")
136
+ embeddings = model.encode(df[col].tolist(), show_progress_bar=True)
137
+ np.save(embedding_file, embeddings)
138
+
139
+ print(f"Fitting SVD for column '{col}'...")
140
+ svd = TruncatedSVD(n_components=svd_dim, random_state=42)
141
+ svd.fit(embeddings)
142
+ joblib.dump(svd, svd_file)
143
+
144
+ reduced = svd.transform(embeddings)
145
+ embed_df = pd.DataFrame(reduced, columns=[f'{col}_embed_{i}' for i in range(reduced.shape[1])])
146
+ embed_df.index = df.index # Force matching index
147
+ df = pd.concat([df, embed_df], axis=1)
148
+ return df
149
+
150
+
151
+ # --- Step 3: Build Preprocessor ---
152
+ def build_preprocessor(numeric_features, categorical_features, multilabel_fields):
153
+ numeric_pipeline = SKPipeline([
154
+ ('imputer', SimpleImputer(strategy='median')),
155
+ ('scaler', StandardScaler())], memory="cache_dir"
156
+ )
157
+
158
+ categorical_pipeline = SKPipeline([
159
+ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
160
+ ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))], memory="cache_dir"
161
+ )
162
+
163
+ transformers = [
164
+ ('num', numeric_pipeline, numeric_features),
165
+ ('cat', categorical_pipeline, categorical_features),
166
+ *[(f'mlb_{col}', MultiLabelBinarizerTransformer(), col) for col in multilabel_fields]]
167
+
168
+
169
+ return ColumnTransformer(transformers, sparse_threshold=0.0)
170
+
171
+ # --- Step 4: Build Pipeline ---
172
+ def build_pipeline(preprocessor, base_model, k=250):
173
+ return ImbPipeline(steps=[
174
+ ('preprocessor', preprocessor),
175
+ ('anomaly', AnomalyScoreTransformer()),
176
+ ('resample', ADASYN()),
177
+ ("variance_filter", VarianceThreshold(threshold=0.0)),
178
+ ('feature_select', SelectKBest(score_func=f_classif, k=k)),
179
+ ('classifier', CalibratedClassifierCV(estimator=base_model, method='isotonic', cv=3))
180
+ ])
181
+
182
+ # --- Step 5: Drift Monitoring ---
183
+ def monitor_drift(reference, current, feature_names, output_html='drift_report.html'):
184
+ ref_df = pd.DataFrame(reference, columns=feature_names)
185
+ cur_df = pd.DataFrame(current, columns=feature_names)
186
+
187
+ report = Report(metrics=[DataDriftPreset()])
188
+ report.run(reference_data=ref_df, current_data=cur_df)
189
+ report.save_html(output_html)
190
+ print(f"✅ Drift report saved to {output_html}")
191
+
192
+
193
+ # --- Step 6: Evaluation + SHAP ---
194
+ def evaluate_model(model, X_train, X_test, y_train, y_test, feature_names):
195
+ model.fit(X_train, y_train)
196
+ y_pred = model.predict(X_test)
197
+ print(classification_report(y_test, y_pred))
198
+ ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
199
+ plt.title("Evaluation")
200
+ plt.tight_layout()
201
+ plt.show()
202
+
203
+ X_proc = model.named_steps['preprocessor'].transform(X_test)
204
+ if scipy.sparse.issparse(X_proc):
205
+ X_proc = X_proc.toarray()
206
+
207
+ selector = model.named_steps['feature_select']
208
+ X_selected = selector.transform(X_proc)
209
+
210
+ explainer = shap.Explainer(model.named_steps['classifier'].calibrated_classifiers_[0].estimator, feature_names=feature_names)  # fitted fold estimator
211
+ shap_values = explainer(X_selected)
212
+ shap.summary_plot(shap_values, X_selected)
213
+
214
+ # --- Final Orchestration ---
215
+ def status_prediction_model(df):
216
+ os.makedirs("model_artifacts", exist_ok=True)
217
+ print("🧹 Preparing data...")
218
+ df = prepare_data(df, is_train=True)
219
+ print("💡 Embedding text...")
220
+ df = compute_embeddings(df, ['title', 'objective'])
221
+
222
+ text_embed_cols = [col for col in df.columns if '_embed_' in col]
223
+ numeric_features = ['durationDays', 'ecMaxContribution', 'totalCost',
224
+ 'n_partners', 'n_country', 'n_sme'] + text_embed_cols
225
+ categorical_features = ['fundingScheme', 'legalBasis', 'nature']
226
+ multilabel_fields = ['list_country', 'list_activityType', 'list_deliverableType',
227
+ 'list_availableLanguages', 'list_euroSciVocTitle','euroSciVoc_intermediate']
228
+
229
+
230
+ df = df[numeric_features + categorical_features + multilabel_fields + ['label']]
231
+ X = df.drop(columns='label')
232
+ y = df['label']
233
+
234
+
235
+ X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
236
+
237
+ print("🧱 Building pipeline...")
238
+ preprocessor = build_preprocessor(numeric_features, categorical_features, multilabel_fields)
239
+ base_model = XGBClassifier(eval_metric='logloss', n_jobs=-1)
240
+
241
+ print("🎯 Training model with Optuna...")
242
+ def objective(trial):
243
+ params = {
244
+ 'n_estimators': trial.suggest_int('n_estimators', 100, 300),
245
+ 'max_depth': trial.suggest_int('max_depth', 3, 10),
246
+ 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
247
+ 'scale_pos_weight': trial.suggest_float('scale_pos_weight', 2.0, 10.0)
248
+ }
249
+ base_model.set_params(**params)
250
+ pipeline = build_pipeline(preprocessor, base_model)
251
+ scores = cross_val_score(pipeline, X, y, cv=StratifiedKFold(3, shuffle=True, random_state=42),
252
+ scoring=make_scorer(f1_score, pos_label=1),n_jobs=-1)
253
+ return scores.mean()
254
+
255
+ study = optuna.create_study(direction='maximize')
256
+ study.optimize(objective, n_trials=24,n_jobs=6)
257
+ best_params = study.best_trial.params
258
+ base_model.set_params(**best_params)
259
+
260
+ print("✅ Training final model and evaluating...")
261
+ final_pipeline = build_pipeline(preprocessor, base_model)
262
+ selector = final_pipeline.named_steps['feature_select']
263
+ if hasattr(selector, 'get_support'):
264
+ feature_names = np.array(final_pipeline.named_steps['preprocessor'].get_feature_names_out())[selector.get_support()]
265
+ else:
266
+ feature_names = np.array(final_pipeline.named_steps['preprocessor'].get_feature_names_out())
267
+ evaluate_model(final_pipeline, X_train, X_test, y_train, y_test, feature_names)
268
+
269
+ print("📊 Monitoring drift...")
270
+ ref_data = preprocessor.transform(X_train)
271
+ cur_data = preprocessor.transform(X_test)
272
+ if scipy.sparse.issparse(ref_data): ref_data = ref_data.toarray()
273
+ if scipy.sparse.issparse(cur_data): cur_data = cur_data.toarray()
274
+ monitor_drift(pd.DataFrame(ref_data), pd.DataFrame(cur_data), feature_names)
275
+ print("💾 Saving model and artifacts...")
276
+ joblib.dump(final_pipeline, "model_artifacts/model.pkl")
277
+ joblib.dump(preprocessor, "model_artifacts/preprocessor.pkl")
278
+ X_train.to_csv("model_artifacts/X_train_processed.csv", index=False)
279
+ y_train.to_csv("model_artifacts/y_train.csv", index=False)
280
+ feature_config = {
281
+ "numeric_features": numeric_features,
282
+ "categorical_features": categorical_features,
283
+ "multilabel_fields": multilabel_fields
284
+ }
285
+ json.dump(feature_config, open("model_artifacts/feature_config.json", "w"))
286
+ print("✅ Training complete. Model artifacts saved.")
287
+
288
+ def score(new_df, model_dir="model_artifacts"):
289
+ # 1) Load artifacts
290
+ pipe = joblib.load(os.path.join(model_dir, "model.pkl"))
291
+ config = json.load(open(os.path.join(model_dir, "feature_config.json")))
292
+
293
+ # 2) Prepare & embed exactly as in training
294
+ df = prepare_data(new_df.copy(), is_train=False)
295
+ text_cols = ['title', 'objective']
296
+ sbert = SentenceTransformer('sentence-transformers/LaBSE')
297
+ for col in text_cols:
298
+ # load the SVD you trained
299
+ svd = joblib.load(os.path.join(model_dir, f"{col}_svd.pkl"))
300
+ emb = sbert.encode(df[col].tolist(), show_progress_bar=False)
301
+ reduced = svd.transform(emb)
302
+ emb_df = pd.DataFrame(reduced,
303
+ columns=[f"{col}_embed_{i}" for i in range(reduced.shape[1])],
304
+ index=df.index)
305
+ df = pd.concat([df, emb_df], axis=1)
306
+
307
+ # 3) Build the final feature set
308
+ X = df[ config["numeric_features"]
309
+ + config["categorical_features"]
310
+ + config["multilabel_fields"] ]
311
+
312
+ # 4) Predict & attach to DataFrame
313
+ preds = pipe.predict(X)
314
+ probs = pipe.predict_proba(X)[:, 1] # assume binary and positive class = index 1
315
+ df["predicted_label"] = preds
316
+ df["predicted_prob"] = probs
317
+
318
+ # 5) SHAP explanations on the *selected* features
319
+ # (we need to re-run preprocessing + feature_selection)
320
+ preproc = pipe.named_steps["preprocessor"]
321
+ select = pipe.named_steps["feature_select"]
322
+ clf = pipe.named_steps["classifier"].calibrated_classifiers_[0].estimator  # fitted XGB model from the first calibration fold
323
+
324
+ X_proc = preproc.transform(X)
325
+ if scipy.sparse.issparse(X_proc):
326
+ X_proc = X_proc.toarray()
327
+ X_sel = select.transform(X_proc)
328
+
329
+ feature_names = select.get_feature_names_out(
330
+ preproc.get_feature_names_out()
331
+ )
332
+
333
+ # Use a TreeExplainer directly on the XGB base estimator
334
+ explainer = shap.Explainer(clf, X_sel, feature_names=feature_names)
335
+ shap_vals = explainer(X_sel) # returns a shap.Explanation object
336
+
337
+ # 6) For each row, pick top-4 absolute contributors
338
+ shap_df = pd.DataFrame(shap_vals.values, columns=feature_names, index=df.index)
339
+ abs_shap = shap_df.abs()
340
+
341
+ top_feats = abs_shap.apply(lambda row: row.nlargest(4).index.tolist(), axis=1)
342
+ top_vals = abs_shap.apply(lambda row: row.nlargest(4).values.tolist(), axis=1)
343
+
344
+ df[["top1_feature","top2_feature","top3_feature","top4_feature"]] = pd.DataFrame(
345
+ top_feats.tolist(), index=df.index
346
+ )
347
+ df[["top1_shap","top2_shap","top3_shap","top4_shap"]] = pd.DataFrame(
348
+ top_vals.tolist(), index=df.index
349
+ )
350
+
351
+ return df
352
+
353
+ if __name__ == "__main__":
354
+ df = pd.read_csv("your_data.csv")
355
+
356
+ status_prediction_model(df)
357
+
358
+ new_df = pd.read_csv("new_data.csv")
359
+ scored_df = score(new_df)
360
+ print(scored_df.head())
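The tail end of score() converts the SHAP matrix into per-row top-contributor columns. A minimal sketch of that ranking step on a tiny made-up SHAP frame (the feature names and values are illustrative):

# Sketch of the per-row top-absolute-SHAP ranking used in score(), on toy values.
import pandas as pd

shap_df = pd.DataFrame(
    [[0.10, -0.40, 0.05],
     [-0.02, 0.30, -0.60]],
    columns=["durationDays", "n_partners", "totalCost"],
)
abs_shap = shap_df.abs()

top_feats = abs_shap.apply(lambda row: row.nlargest(2).index.tolist(), axis=1)
top_vals = abs_shap.apply(lambda row: row.nlargest(2).values.tolist(), axis=1)

print(top_feats.tolist())  # [['n_partners', 'durationDays'], ['totalCost', 'n_partners']]
print(top_vals.tolist())   # [[0.4, 0.1], [0.6, 0.3]]

score() does the same with nlargest(4) and then spreads the lists into the top1..top4 feature and SHAP-value columns it attaches to the scored frame.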
rag_test.py DELETED
@@ -1,223 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import numpy as np
4
- import faiss
5
- import pickle
6
- import spacy
7
- import re
8
- from sentence_transformers import SentenceTransformer
9
- from langchain.prompts import PromptTemplate
10
- from langchain.llms import OpenAI
11
- import matplotlib.pyplot as plt
12
-
13
- # --- Load Models and Data ---
14
- @st.cache_resource
15
- def load_models_and_data():
16
- nlp = spacy.load("en_core_web_sm")
17
- embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
18
-
19
- projects_df = pd.read_parquet("cordis_projects.parquet")
20
- with open("topic_summaries.pkl", "rb") as f:
21
- topic_summaries_df = pickle.load(f)
22
-
23
- project_index = faiss.read_index("project_chunks.faiss")
24
- topic_index = faiss.read_index("topic_summary_index.faiss")
25
-
26
- return nlp, embedding_model, projects_df, topic_summaries_df, project_index, topic_index
27
-
28
- nlp, embedding_model, projects_df, topic_summaries_df, project_index, topic_index = load_models_and_data()
29
-
30
- # --- RAG Components ---
31
-
32
- class SessionContext:
33
- def __init__(self):
34
- self.last_query_type = None
35
- self.last_entity = None
36
- self.last_projects_df = None
37
- self.last_project_id = None
38
-
39
- def update(self, query_type, entity, df=None, project_id=None):
40
- self.last_query_type = query_type
41
- self.last_entity = entity
42
- self.last_projects_df = df
43
- self.last_project_id = project_id
44
-
45
- session_context = SessionContext()
46
-
47
- def classify_query_type(query: str) -> str:
48
- query = query.lower()
49
- if re.search(r'\b(project|grant agreement|projectid|rcn|ga)\b', query):
50
- return "project"
51
- if re.search(r'\b(topic|eurovoc|euro-scivoc)\b', query):
52
- return "topic"
53
- if re.search(r'\b(organization|institution|company|beneficiary)\b', query):
54
- return "organization"
55
- if re.search(r'\b(legalbasis|legislation|h2020|fp7)\b', query):
56
- return "legalBasis"
57
- return "general"
58
-
59
- def extract_entities_custom(query: str) -> dict:
60
- entities = {"project_id": None, "organization": None, "topic": None}
61
- doc = nlp(query)
62
- match = re.search(r"\b\d{6,8}\b", query)
63
- if match:
64
- entities["project_id"] = match.group(0)
65
- for ent in doc.ents:
66
- if ent.label_ == "ORG":
67
- entities["organization"] = ent.text
68
- if ent.label_ == "MISC":
69
- entities["topic"] = ent.text
70
- return entities
71
-
72
- def retrieve_project_chunks_by_id(project_id: str) -> list:
73
- subset = projects_df[projects_df['project_id'] == project_id]
74
- return subset['chunk_text'].tolist()
75
-
76
- def retrieve_topic_summary_from_project(project_id: str) -> str:
77
- row = projects_df[projects_df['project_id'] == project_id]
78
- if row.empty:
79
- return ""
80
- topic = row['topic_path'].values[0]
81
- match = topic_summaries_df[topic_summaries_df['topic'] == topic]
82
- return match['summary'].values[0] if not match.empty else ""
83
-
84
- def get_kpi_context() -> str:
85
- return (
86
- "Average project duration: 780 days\n"
87
- "Termination rate: 12.3%\n"
88
- "Top countries by project count: Germany, France, Italy\n"
89
- "Common termination reasons: coordination failure, underperformance"
90
- )
91
-
92
- def run_reasoning(question: str, df: pd.DataFrame) -> str:
93
- q = question.lower()
94
- if "average" in q and "funding" in q:
95
- avg = df['ecMaxContribution'].mean()
96
- return f"The average funding is €{avg:,.2f}."
97
- return ""
98
-
99
- def generate_follow_up_suggestions(query_type: str) -> str:
100
- if query_type == "project":
101
- return "Would you like more info on related topics or organizations involved?"
102
- if query_type == "organization":
103
- return "Want to explore more projects from this organization?"
104
- if query_type == "topic":
105
- return "Would you like to see top projects under this topic?"
106
- return ""
107
-
108
- def is_follow_up_question(question: str) -> bool:
109
- q = question.lower()
110
- return any(phrase in q for phrase in [
111
- "they", "those", "what do they", "what are they about", "explain them", "what's the topic"
112
- ])
113
-
114
- def summarize_projects(df: pd.DataFrame, field="objective", top_n=5) -> str:
115
- texts = df[field].dropna().tolist()[:top_n]
116
- if not texts:
117
- return "No objectives or summaries available for these projects."
118
- combined_text = "\n\n".join(texts)
119
- prompt = f"Summarize what these projects are generally about:\n\n{combined_text}"
120
- return OpenAI(temperature=0)(prompt)
121
-
122
- rag_template = PromptTemplate(
123
- input_variables=["project_info", "topic_summary", "kpi_context", "programmatic", "question", "followup"],
124
- template="""
125
- You are a research assistant for a European funding agency. Use the provided context to answer the user query.
126
-
127
- --- Project Info ---
128
- {project_info}
129
-
130
- --- Topic Summary ---
131
- {topic_summary}
132
-
133
- --- KPI Context ---
134
- {kpi_context}
135
-
136
- --- Data Insights ---
137
- {programmatic}
138
-
139
- --- Question ---
140
- {question}
141
-
142
- --- Answer ---
143
-
144
- Also consider:
145
- {followup}
146
- """
147
- )
148
-
149
- def chat_with_context(question: str, session: SessionContext) -> str:
150
- if is_follow_up_question(question) and session.last_projects_df is not None:
151
- return summarize_projects(session.last_projects_df)
152
-
153
- query_type = classify_query_type(question)
154
- entities = extract_entities_custom(question)
155
- project_info = topic_summary = programmatic = ""
156
- relevant_df = projects_df.copy()
157
- project_id = None
158
-
159
- if entities["project_id"]:
160
- project_id = entities["project_id"]
161
- project_info = "\n".join(retrieve_project_chunks_by_id(project_id))
162
- topic_summary = retrieve_topic_summary_from_project(project_id)
163
- relevant_df = projects_df[projects_df['project_id'] == project_id]
164
-
165
- elif query_type == "organization":
166
- org = entities.get("organization")
167
- if org:
168
- relevant_df = projects_df[projects_df['list_name'].str.contains(org, case=False, na=False)]
169
- project_info = "\n".join(relevant_df['chunk_text'].head(3))
170
-
171
- programmatic = run_reasoning(question, relevant_df)
172
- kpi_context = get_kpi_context()
173
- followup = generate_follow_up_suggestions(query_type)
174
-
175
- prompt = rag_template.format(
176
- project_info=project_info,
177
- topic_summary=topic_summary,
178
- kpi_context=kpi_context,
179
- programmatic=programmatic,
180
- question=question,
181
- followup=followup
182
- )
183
-
184
- session.update(query_type, entities.get("organization") or entities.get("project_id"), relevant_df, project_id)
185
-
186
- return OpenAI(temperature=0)(prompt)
187
-
188
- # --- Streamlit UI ---
189
-
190
- st.set_page_config(page_title="EU Funding Explorer", layout="wide")
191
- st.title("🇪🇺 EU Projects Dashboard")
192
-
193
- tabs = st.tabs(["📊 Dashboard", "📁 Projects + Chatbot"])
194
-
195
- # --- Tab 1: Dashboard ---
196
- with tabs[0]:
197
- st.subheader("Funding Overview")
198
- funding_by_year = projects_df.groupby("startYear")["ecMaxContribution"].sum().reset_index()
199
- plt.figure(figsize=(10,4))
200
- plt.bar(funding_by_year["startYear"], funding_by_year["ecMaxContribution"] / 1e6)
201
- plt.ylabel("Total Funding (€M)")
202
- st.pyplot(plt)
203
-
204
- top_orgs = projects_df["list_name"].value_counts().head(10)
205
- st.bar_chart(top_orgs)
206
-
207
- # --- Tab 2: Projects + Chatbot ---
208
- with tabs[1]:
209
- left, right = st.columns([2, 1])
210
-
211
- with left:
212
- st.subheader("Browse Projects")
213
- page_size = 10
214
- page_num = st.number_input("Page", min_value=0, max_value=(len(projects_df) // page_size), step=1)
215
- paginated = projects_df.iloc[page_num * page_size : (page_num + 1) * page_size]
216
- st.dataframe(paginated[["title", "list_name", "ecMaxContribution", "startYear", "status"]], use_container_width=True)
217
-
218
- with right:
219
- st.subheader("Ask the Chatbot")
220
- user_input = st.text_input("Ask a question...")
221
- if st.button("Ask") and user_input:
222
- answer = chat_with_context(user_input, session_context)
223
- st.markdown(answer)