Ben Chuanlong Du's Blog

It is never too late to learn.

Read Parquet Files Using Polars in Rust

The content on this page consists of the author's fragmentary and immature notes and thoughts. Please read it with your own judgement!

In [ ]:
:timing
:sccache 1
:dep polars = { version = "0.26.1", features = ["lazy", "parquet"] }
In [ ]:
use polars::df;
use polars::prelude::*;
use polars::datatypes::DataType;
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;
In [8]:
// Scan a single Parquet file through the lazy API, then materialize the
// result with `collect()`. A failure in either step is surfaced through `?`.
// The binding is never mutated in this cell, so it is declared without `mut`.
let frame = LazyFrame::scan_parquet("part-000.parquet", ScanArgsParquet::default())?.collect()?;
// Trailing expression: the notebook displays its value as the cell output.
frame
Out[8]:
shape: (4002557, 4)
┌──────┬──────────────────┬──────────────────┬─────────────┐
│ id0  ┆ id1              ┆ id2              ┆ score_r4_it │
│ ---  ┆ ---              ┆ ---              ┆ ---         │
│ u64  ┆ u64              ┆ u64              ┆ f64         │
╞══════╪══════════════════╪══════════════════╪═════════════╡
│ 2    ┆ 16796161         ┆ 4503599635760400 ┆ 1.07059     │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 16   ┆ 8293             ┆ 3288727552       ┆ -0.085568   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2    ┆ 4503599660926465 ┆ 28684            ┆ 4.779815    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 4503599627386947 ┆ 274911465616     ┆ 4.650999    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ...  ┆ ...              ┆ ...              ┆ ...         │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 68727866378      ┆ 4224323673915392 ┆ 11.800134   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 4199169          ┆ 1073875984       ┆ 0.594198    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 512  ┆ 1114250          ┆ 70377336209696   ┆ 7.129028    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4096 ┆ 1057409          ┆ 4573976961483792 ┆ 18.13174    │
└──────┴──────────────────┴──────────────────┴─────────────┘

Count the Total Number of Rows of All Parquet Files

In [46]:
// Count the rows across every Parquet file matched by the glob pattern and
// yield the total as a plain integer (the value of this cell).
LazyFrame::scan_parquet("data/test/**/*.parquet", ScanArgsParquet::default())
    .unwrap()
    // Single-column projection: the row count, cast to u64 and named "n".
    .select(&[count().cast(DataType::UInt64).alias("n")])
    .collect()
    .unwrap()["n"]
    .u64()
    .unwrap()
    // Pull the first value out of column "n".
    .get(0)
    .unwrap()
Out[46]:
382761717
Out[46]:
Took 6767ms
In [37]:
// Build the lazy scan over every Parquet file matched by the glob pattern.
let frame: LazyFrame =
    LazyFrame::scan_parquet("data/test/**/*.parquet", ScanArgsParquet::default()).unwrap();
// Run a one-column query holding the row count (cast to u64, named "n").
let df: DataFrame = frame
    .select(&[count().cast(DataType::UInt64).alias("n")])
    .collect()
    .unwrap();
// Trailing expression so the notebook renders the resulting DataFrame.
df
Out[37]:
shape: (1, 1)
┌───────────┐
│ n         │
│ ---       │
│ u64       │
╞═══════════╡
│ 382761717 │
└───────────┘
Out[37]:
Took 6477ms

Comments