""" |
|
Streamlit application for extracting site and sector information from .docx design files. |
|
The logic is adapted from `Sector Stacked.py` but provides an interactive UI where users can |
|
upload one or many Word documents and instantly visualise / download the results. |
|
""" |
|
|
|
import io |
|
import os |
|
import re |
|
from typing import List |
|
|
|
import pandas as pd |
|
import plotly.express as px |
|
import streamlit as st |
|
from docx import Document |
|
from streamlit.commands.execution_control import rerun |
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_info_from_docx_separated_sectors( |
|
docx_bytes: bytes, filename: str |
|
) -> List[dict]: |
|
"""Extract the site-level and sector-level information from a Word design file. |
|
|
|
Parameters |
|
---------- |
|
docx_bytes : bytes |
|
Raw bytes of the `.docx` file – read directly from the Streamlit uploader. |
|
filename : str |
|
Original filename. Used only for reference in the output. |
|
|
|
Returns |
|
------- |
|
list[dict] |
|
A list containing up to three dictionaries – one for each sector. |
|
""" |
    doc = Document(io.BytesIO(docx_bytes))

    # Site-wide fields shared by every sector row.
    site_shared = {
        "File": filename,
        "Code": None,
        "Site Name": None,
        "Localité": None,
        "Adresse": None,
        "X": None,
        "Y": None,
        "Z": None,
        "UTM_Zone": None,
    }

    # Per-sector fields; the design files describe at most three sectors.
    sector_data = {
        "Azimuth": [None] * 3,
        "Height": [None] * 3,
        "MechTilt": [None] * 3,
        "ElecTilt": [None] * 3,
    }

    for table in doc.tables:
        for row in table.rows:
            cells = [cell.text.strip() for cell in row.cells if cell.text.strip()]
            if not cells:
                continue

            row_text_lower = " | ".join(cells).lower()

            # Site code: the value cell contains a "T00" or "N01" pattern.
            if site_shared["Code"] is None and any("code" in c.lower() for c in cells):
                for val in cells:
                    if ("t00" in val.lower()) or ("n01" in val.lower()):
                        site_shared["Code"] = val.replace(" ", "").strip()
                        break

            # Site name: rows labelled "Nom" whose value carries the same pattern.
            if site_shared["Site Name"] is None and any(
                "nom" in c.lower() for c in cells
            ):
                for val in cells:
                    if ("t00" in val.lower()) or ("n01" in val.lower()):
                        site_shared["Site Name"] = val.strip()
                        break

            # UTM zone, e.g. "UTM 29".
            if site_shared["UTM_Zone"] is None:
                utm_match = re.search(r"utm\s*(\d+)", row_text_lower)
                if utm_match:
                    site_shared["UTM_Zone"] = f"UTM{utm_match.group(1)}"

            # Locality / address: take the first cell that is not the label itself.
            if site_shared["Localité"] is None and any(
                "localité" in c.lower() for c in cells
            ):
                for val in cells:
                    if val.lower() != "localité:":
                        site_shared["Localité"] = val.strip()
                        break
            if site_shared["Adresse"] is None and any(
                "adresse" in c.lower() for c in cells
            ):
                for val in cells:
                    if val.lower() != "adresse:":
                        site_shared["Adresse"] = val.strip()
                        break

            # Coordinates: the "X"/"Y"/"Z" label may sit in its own cell (value
            # in the next cell) or be fused with the value in a single cell.
            if {"X", "Y", "Z"}.intersection(cells):
                for i, cell_text in enumerate(cells):
                    text = cell_text.strip()

                    if text == "X" and i + 1 < len(cells):
                        site_shared["X"] = cells[i + 1].strip()

                    elif re.search(r"Y\s*[0-9]", text):
                        match = re.search(r"Y\s*([0-9°'\.\sWE]+)", text)
                        if match:
                            site_shared["Y"] = match.group(1).strip()
                    elif text == "Y" and i + 1 < len(cells):
                        site_shared["Y"] = cells[i + 1].strip()

                    elif re.search(r"Z\s*[0-9]", text):
                        match = re.search(r"Z\s*([0-9]+)", text)
                        if match:
                            site_shared["Z"] = match.group(1).strip()
                    elif text == "Z" and i + 1 < len(cells):
                        z_val = re.search(r"([0-9]+)", cells[i + 1])
                        if z_val:
                            site_shared["Z"] = z_val.group(1).strip()

            # Sector rows: the first cell holds the French label, the following
            # cells hold the values for sectors 1-3.
            first_cell = cells[0].lower()
            if first_cell == "azimut":
                for i in range(min(3, len(cells) - 1)):
                    sector_data["Azimuth"][i] = cells[i + 1]
            elif "hauteur des aériens" in first_cell:
                for i in range(min(3, len(cells) - 1)):
                    sector_data["Height"][i] = cells[i + 1]
            elif "tilt mécanique" in first_cell:
                for i in range(min(3, len(cells) - 1)):
                    sector_data["MechTilt"][i] = cells[i + 1]
            elif "tilt électrique" in first_cell:
                for i in range(min(3, len(cells) - 1)):
                    sector_data["ElecTilt"][i] = cells[i + 1]

    # Emit one row per sector that has at least an azimuth.
    rows: List[dict] = []
    for sector_id in range(3):
        if sector_data["Azimuth"][sector_id]:
            rows.append(
                {
                    **site_shared,
                    "Sector ID": sector_id + 1,
                    "Azimuth": sector_data["Azimuth"][sector_id],
                    "Height": sector_data["Height"][sector_id],
                    "MechTilt": sector_data["MechTilt"][sector_id],
                    "ElecTilt": sector_data["ElecTilt"][sector_id],
                }
            )
    return rows


def convert_coord_to_decimal(coord: str, default_direction: str | None = None):
    """Convert coordinate strings containing degrees/minutes/seconds to decimal degrees.

    Handles various formats, e.g. "3° 33' 12.4\" W", "3 33 12.4 O", "-3.5534", "3.5534E".
    West (W/O) or South (S) are returned as negative values.
    Returns None if conversion fails.
    """
    if coord is None or (isinstance(coord, float) and pd.isna(coord)):
        return None

    text = str(coord).replace(",", ".").strip()
    if not text:
        return None

    # Detect an explicit hemisphere letter (O is "Ouest", French for West).
    direction = None
    match_dir = re.search(r"([NSEWnsewOo])", text)
    if match_dir:
        direction = match_dir.group(1).upper()
        text = text.replace(match_dir.group(1), "")
    elif default_direction is not None:
        direction = default_direction.upper()

    nums = re.findall(r"[-+]?(?:\d+\.?\d*)", text)
    if not nums:
        return None

    nums_f = [float(n) for n in nums]

    # Combine degrees/minutes/seconds, preserving the sign of the degrees
    # component (e.g. "-3 33 12.4" must not add positive minutes to -3).
    sign = -1.0 if nums_f[0] < 0 else 1.0
    if len(nums_f) >= 3:
        deg, minute, sec = abs(nums_f[0]), nums_f[1], nums_f[2]
        dec = sign * (deg + minute / 60 + sec / 3600)
    elif len(nums_f) == 2:
        deg, minute = abs(nums_f[0]), nums_f[1]
        dec = sign * (deg + minute / 60)
    else:
        dec = nums_f[0]

    # West (W/O) and South (S) coordinates are negative by convention.
    if direction in {"W", "O", "S"}:
        dec = -abs(dec)

    return dec


def process_files_to_dataframe(uploaded_files) -> pd.DataFrame:
    """Run extraction on the uploaded files and return a concatenated dataframe."""
    all_rows: List[dict] = []
    for uploaded in uploaded_files:
        rows = extract_info_from_docx_separated_sectors(uploaded.read(), uploaded.name)
        all_rows.extend(rows)
    df = pd.DataFrame(all_rows)

    # Derive decimal-degree columns for the map view.
    if not df.empty and {"X", "Y"}.issubset(df.columns):
        df["X_decimal"] = df["X"].apply(
            lambda c: convert_coord_to_decimal(c, default_direction="N")
        )
        df["Y_decimal"] = df["Y"].apply(
            lambda c: convert_coord_to_decimal(c, default_direction="W")
        )

    return df


def main() -> None:
    st.set_page_config(
        page_title="F4NB Extractor to Excel", page_icon="📄", layout="wide"
    )

    st.title("📄 F4NB Extractor to Excel")
    st.markdown(
        "Convert F4NB Word documents into a tidy Excel / DataFrame containing site & sector information.\n"
        "Upload one or many F4NB `.docx` files and hit **Process**."
    )

    # Offer the bundled sample document for download, if it is present.
    fnb_sample_file_path = "samples/FN4B.docx"
    if os.path.exists(fnb_sample_file_path):
        with open(fnb_sample_file_path, "rb") as sample_file:
            st.download_button(
                label="Download FNB Sample File",
                data=sample_file.read(),
                file_name="fnb.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            )

    st.subheader("Upload Files")
    uploaded_files = st.file_uploader(
        "Select one or more F4NB `.docx` files",
        type=["docx"],
        accept_multiple_files=True,
    )
    process_btn = st.button("Process", type="primary", disabled=not uploaded_files)

    if process_btn and uploaded_files:
        with st.spinner("Extracting information…"):
            df = process_files_to_dataframe(uploaded_files)

        if df.empty:
            st.warning(
                "No data extracted. Check that the files conform to the expected format."
            )
            return

        st.success(
            f"Processed {len(uploaded_files)} file(s); extracted {len(df)} sector rows."
        )
        st.dataframe(df, use_container_width=True)

        st.markdown("---")

        # Write the dataframe to an in-memory Excel workbook for download.
        buffer = io.BytesIO()
        with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
            df.to_excel(writer, index=False, sheet_name="Extract")
        st.download_button(
            label="💾 Download Excel",
            data=buffer.getvalue(),
            file_name="extracted_fnb.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            on_click="ignore",
            type="primary",
        )
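        # on_click="ignore" skips the automatic rerun when the download button
        # is clicked; it is only accepted by recent Streamlit releases, so drop
        # the argument if you target an older version.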
        st.markdown("---")

        # Map the sites whose coordinates converted successfully.
        if {"Y_decimal", "X_decimal"}.issubset(df.columns):
            geo_cols = ["Y_decimal", "X_decimal", "Site Name", "Code"]
            geo_df = (
                df[geo_cols + (["Height"] if "Height" in df.columns else [])]
                .dropna(subset=geo_cols)
                .rename(columns={"Y_decimal": "Longitude", "X_decimal": "Latitude"})
            )
            # Marker size follows the antenna height, defaulting to 10.
            geo_df["Size"] = (
                pd.to_numeric(geo_df["Height"], errors="coerce").fillna(10)
                if "Height" in geo_df.columns
                else 10
            )
            if not geo_df.empty:
                st.subheader("🗺️ Site Locations")
                fig = px.scatter_map(
                    geo_df,
                    lat="Latitude",
                    lon="Longitude",
                    hover_name="Site Name",
                    hover_data={"Code": True},
                    size="Size",
                    size_max=10,
                    zoom=6,
                    height=500,
                )
                # px.scatter_map renders with MapLibre, so the layout key is
                # `map_style` rather than the Mapbox-era `mapbox_style`.
                fig.update_layout(
                    map_style="open-street-map",
                    margin={"r": 0, "t": 0, "l": 0, "b": 0},
                )
                st.plotly_chart(fig, use_container_width=True)


if __name__ == "__main__":
    main()