Spaces:
Sleeping
Sleeping
refactor: Improve DuckDB Parquet tutorial notebook
Browse files- Add author attribution to notebook header
- Add sqlglot dependency for future SQL parsing capabilities
- Use consistent table references via variables instead of string literals
- Remove unused pyarrow import
- Improve markdown formatting for better readability
The notebook now properly references the created airbnb_stock table
through variables, making the code more maintainable and reducing
the risk of typos in table names.
- duckdb/008_loading_parquet.py +34 -34
duckdb/008_loading_parquet.py
CHANGED
|
@@ -5,6 +5,7 @@
|
|
| 5 |
# "duckdb==1.2.1",
|
| 6 |
# "pyarrow==19.0.1",
|
| 7 |
# "plotly.express",
|
|
|
|
| 8 |
# ]
|
| 9 |
# ///
|
| 10 |
|
|
@@ -16,7 +17,13 @@ app = marimo.App(width="medium")
|
|
| 16 |
|
| 17 |
@app.cell(hide_code=True)
|
| 18 |
def _(mo):
|
| 19 |
-
mo.md(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
return
|
| 21 |
|
| 22 |
|
|
@@ -39,10 +46,11 @@ def _(mo):
|
|
| 39 |
)
|
| 40 |
return
|
| 41 |
|
|
|
|
| 42 |
@app.cell
|
| 43 |
def _():
|
| 44 |
AIRBNB_URL = 'https://huggingface.co/datasets/BatteRaquette58/airbnb-stock-price/resolve/main/data/airbnb-stock.parquet'
|
| 45 |
-
return AIRBNB_URL,
|
| 46 |
|
| 47 |
|
| 48 |
@app.cell(hide_code=True)
|
|
@@ -64,7 +72,7 @@ def _(mo):
|
|
| 64 |
|
| 65 |
|
| 66 |
@app.cell
|
| 67 |
-
def _(AIRBNB_URL, mo):
|
| 68 |
mo.sql(
|
| 69 |
f"""
|
| 70 |
SELECT *
|
|
@@ -86,8 +94,8 @@ def _(mo):
|
|
| 86 |
mo.md(
|
| 87 |
r"""
|
| 88 |
For more control, you can use the `read_parquet` table function. This is useful when you need to specify options, for example, when dealing with multiple files or specific data types.
|
| 89 |
-
|
| 90 |
Some useful options for `read_parquet` include:
|
|
|
|
| 91 |
- `binary_as_string=True`: Reads `BINARY` columns as `VARCHAR`.
|
| 92 |
- `filename=True`: Adds a `filename` column with the path of the file for each row.
|
| 93 |
- `hive_partitioning=True`: Enables reading of Hive-partitioned datasets.
|
|
@@ -148,23 +156,23 @@ def _(AIRBNB_URL, mo):
|
|
| 148 |
SELECT * FROM read_parquet('{AIRBNB_URL}');
|
| 149 |
"""
|
| 150 |
)
|
| 151 |
-
return stock_table
|
| 152 |
|
| 153 |
|
| 154 |
-
@app.cell
|
| 155 |
def _(mo, stock_table):
|
| 156 |
mo.md(
|
| 157 |
f"""
|
| 158 |
-
|
| 159 |
|
| 160 |
-
|
| 161 |
-
|
| 162 |
)
|
| 163 |
return
|
| 164 |
|
| 165 |
|
| 166 |
@app.cell
|
| 167 |
-
def _(mo):
|
| 168 |
mo.sql(
|
| 169 |
f"""
|
| 170 |
SELECT * FROM airbnb_stock LIMIT 5;
|
|
@@ -181,15 +189,12 @@ def _(mo):
|
|
| 181 |
|
| 182 |
@app.cell(hide_code=True)
|
| 183 |
def _(mo):
|
| 184 |
-
mo.md(
|
| 185 |
-
r"""
|
| 186 |
-
Let's perform a simple analysis: plotting the closing stock price over time.
|
| 187 |
-
"""
|
| 188 |
-
)
|
| 189 |
return
|
| 190 |
|
|
|
|
| 191 |
@app.cell
|
| 192 |
-
def _(mo):
|
| 193 |
stock_data = mo.sql(
|
| 194 |
f"""
|
| 195 |
SELECT
|
|
@@ -199,16 +204,12 @@ def _(mo):
|
|
| 199 |
ORDER BY "Date";
|
| 200 |
"""
|
| 201 |
)
|
| 202 |
-
return stock_data,
|
| 203 |
|
| 204 |
|
| 205 |
@app.cell(hide_code=True)
|
| 206 |
def _(mo):
|
| 207 |
-
mo.md(
|
| 208 |
-
r"""
|
| 209 |
-
Now we can easily visualize this result using marimo's integration with plotting libraries like Plotly.
|
| 210 |
-
"""
|
| 211 |
-
)
|
| 212 |
return
|
| 213 |
|
| 214 |
|
|
@@ -234,16 +235,16 @@ def _(mo):
|
|
| 234 |
def _(mo):
|
| 235 |
mo.md(
|
| 236 |
r"""
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
)
|
| 248 |
return
|
| 249 |
|
|
@@ -258,9 +259,8 @@ def _():
|
|
| 258 |
@app.cell
|
| 259 |
def _():
|
| 260 |
import pyarrow
|
| 261 |
-
return
|
| 262 |
|
| 263 |
|
| 264 |
if __name__ == "__main__":
|
| 265 |
app.run()
|
| 266 |
-
|
|
|
|
| 5 |
# "duckdb==1.2.1",
|
| 6 |
# "pyarrow==19.0.1",
|
| 7 |
# "plotly.express",
|
| 8 |
+
# "sqlglot==27.0.0",
|
| 9 |
# ]
|
| 10 |
# ///
|
| 11 |
|
|
|
|
| 17 |
|
| 18 |
@app.cell(hide_code=True)
|
| 19 |
def _(mo):
|
| 20 |
+
mo.md(
|
| 21 |
+
r"""
|
| 22 |
+
# Loading Parquet files with DuckDB
|
| 23 |
+
*By [Thomas Liang](https://github.com/thliang01)*
|
| 24 |
+
#
|
| 25 |
+
"""
|
| 26 |
+
)
|
| 27 |
return
|
| 28 |
|
| 29 |
|
|
|
|
| 46 |
)
|
| 47 |
return
|
| 48 |
|
| 49 |
+
|
| 50 |
@app.cell
|
| 51 |
def _():
|
| 52 |
AIRBNB_URL = 'https://huggingface.co/datasets/BatteRaquette58/airbnb-stock-price/resolve/main/data/airbnb-stock.parquet'
|
| 53 |
+
return (AIRBNB_URL,)
|
| 54 |
|
| 55 |
|
| 56 |
@app.cell(hide_code=True)
|
|
|
|
| 72 |
|
| 73 |
|
| 74 |
@app.cell
|
| 75 |
+
def _(AIRBNB_URL, mo, null):
|
| 76 |
mo.sql(
|
| 77 |
f"""
|
| 78 |
SELECT *
|
|
|
|
| 94 |
mo.md(
|
| 95 |
r"""
|
| 96 |
For more control, you can use the `read_parquet` table function. This is useful when you need to specify options, for example, when dealing with multiple files or specific data types.
|
|
|
|
| 97 |
Some useful options for `read_parquet` include:
|
| 98 |
+
|
| 99 |
- `binary_as_string=True`: Reads `BINARY` columns as `VARCHAR`.
|
| 100 |
- `filename=True`: Adds a `filename` column with the path of the file for each row.
|
| 101 |
- `hive_partitioning=True`: Enables reading of Hive-partitioned datasets.
|
|
|
|
| 156 |
SELECT * FROM read_parquet('{AIRBNB_URL}');
|
| 157 |
"""
|
| 158 |
)
|
| 159 |
+
return airbnb_stock, stock_table
|
| 160 |
|
| 161 |
|
| 162 |
+
@app.cell(hide_code=True)
|
| 163 |
def _(mo, stock_table):
|
| 164 |
mo.md(
|
| 165 |
f"""
|
| 166 |
+
{stock_table}
|
| 167 |
|
| 168 |
+
Now that the `airbnb_stock` table is created, we can query it like any other SQL table.
|
| 169 |
+
"""
|
| 170 |
)
|
| 171 |
return
|
| 172 |
|
| 173 |
|
| 174 |
@app.cell
|
| 175 |
+
def _(airbnb_stock, mo):
|
| 176 |
mo.sql(
|
| 177 |
f"""
|
| 178 |
SELECT * FROM airbnb_stock LIMIT 5;
|
|
|
|
| 189 |
|
| 190 |
@app.cell(hide_code=True)
|
| 191 |
def _(mo):
|
| 192 |
+
mo.md(r"""Let's perform a simple analysis: plotting the closing stock price over time.""")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
return
|
| 194 |
|
| 195 |
+
|
| 196 |
@app.cell
|
| 197 |
+
def _(airbnb_stock, mo):
|
| 198 |
stock_data = mo.sql(
|
| 199 |
f"""
|
| 200 |
SELECT
|
|
|
|
| 204 |
ORDER BY "Date";
|
| 205 |
"""
|
| 206 |
)
|
| 207 |
+
return (stock_data,)
|
| 208 |
|
| 209 |
|
| 210 |
@app.cell(hide_code=True)
|
| 211 |
def _(mo):
|
| 212 |
+
mo.md(r"""Now we can easily visualize this result using marimo's integration with plotting libraries like Plotly.""")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
return
|
| 214 |
|
| 215 |
|
|
|
|
| 235 |
def _(mo):
|
| 236 |
mo.md(
|
| 237 |
r"""
|
| 238 |
+
In this notebook, we've seen how easy it is to work with Parquet files in DuckDB. We learned how to:
|
| 239 |
+
<ul>
|
| 240 |
+
<li>Query Parquet files directly from a URL using a simple `FROM` clause.</li>
|
| 241 |
+
<li>Use the `read_parquet` function for more fine-grained control and efficiency.</li>
|
| 242 |
+
<li>Load data from a Parquet file into a DuckDB table.</li>
|
| 243 |
+
<li>Seamlessly analyze and visualize the data using SQL and Python.</li>
|
| 244 |
+
</ul>
|
| 245 |
+
|
| 246 |
+
DuckDB's native Parquet support makes it a powerful tool for interactive data analysis on large datasets without complex ETL pipelines.
|
| 247 |
+
"""
|
| 248 |
)
|
| 249 |
return
|
| 250 |
|
|
|
|
| 259 |
@app.cell
|
| 260 |
def _():
|
| 261 |
import pyarrow
|
| 262 |
+
return
|
| 263 |
|
| 264 |
|
| 265 |
if __name__ == "__main__":
|
| 266 |
app.run()
|
|
|