Ben Chuanlong Du's Blog

It is never too late to learn.

Read CSV Files Using Polars in Rust

Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!

Tips and Traps

  1. LazyCsvReader is more limited than CsvReader. CsvReader supports specifying a schema while LazyCsvReader does not.

  2. An empty field is parsed as null instead of an empty string by default, and there is no way to change this behavior at this time. Please refer to this issue for more discussions. Non-empty strings (e.g., NA) are NOT parsed as null by default. However, parsing specific strings as null is supported via the API CsvReader::with_null_values.

In [2]:
:timing
:sccache 1
:dep polars = { version = "0.26.1", features = ["lazy", "parquet", "dtype-full"] }
Out[2]:
Timing: true
sccache: true
Out[2]:
Took 446722ms
In [3]:
use polars::df;
use polars::prelude::*;
use polars::datatypes::DataType;
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;
Out[3]:
Took 389ms

CsvReader and DataFrame

In [8]:
// Build the schema that is handed to CsvReader in the next cell:
// one explicit dtype per column, keyed by column name.
let mut s = Schema::new();
s.with_column("column_1".into(), DataType::UInt8);
s.with_column("column_2".into(), DataType::UInt8);
s.with_column("column_3".into(), DataType::UInt8);
s.with_column("column_4".into(), DataType::UInt16);
// column_5 is declared as a string column; the reader output shows it holds
// values such as "NA" alongside long digit strings.
s.with_column("column_5".into(), DataType::Utf8);
// Trailing expression without `;` so the notebook displays the schema.
s
Out[8]:
Schema:
name: column_1, data type: UInt8
name: column_2, data type: UInt8
name: column_3, data type: UInt8
name: column_4, data type: UInt16
name: column_5, data type: Utf8
Out[8]:
Took 2375ms
In [9]:
// Read the CSV file into a DataFrame using the eager CsvReader.
let df = CsvReader::from_path("rank53_j0_j0.csv")?
            // Treat the first line as data rather than as column names.
            .has_header(false)
            // Use the dtypes from the schema `s` built in the previous cell.
            .with_dtypes(Some(&s))
            // No custom null markers are configured: in the output "NA" stays a
            // string, while one value still shows as null (presumably the empty
            // field in the file -- verify against the CSV).
            .with_null_values(None)
            .finish()?;
// Trailing expression without `;` so the notebook displays the DataFrame.
df
Out[9]:
shape: (10, 5)
┌──────────┬──────────┬──────────┬──────────┬────────────────┐
│ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5       │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---            │
│ u8       ┆ u8       ┆ u8       ┆ u16      ┆ str            │
╞══════════╪══════════╪══════════╪══════════╪════════════════╡
│ 0        ┆ 1        ┆ 2        ┆ 0        ┆ 56229711839232 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 1        ┆ 57324928499712 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 2        ┆ 37744977903616 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 3        ┆ NA             │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ...      ┆ ...      ┆ ...      ┆ ...      ┆ ...            │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 6        ┆ 38019855810560 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 7        ┆ null           │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 8        ┆ 38157294764032 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 9        ┆ 38226014240768 │
└──────────┴──────────┴──────────┴──────────┴────────────────┘
Out[9]:
Took 3488ms
In [10]:
// Select rows whose column_5 equals the empty string.
// The output has shape (0, 5): no row holds an empty string.
df.filter(
    &df.column("column_5")?.equal("")?
)?
Out[10]:
shape: (0, 5)
┌──────────┬──────────┬──────────┬──────────┬──────────┐
│ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5 │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
│ u8       ┆ u8       ┆ u8       ┆ u16      ┆ str      │
╞══════════╪══════════╪══════════╪══════════╪══════════╡
└──────────┴──────────┴──────────┴──────────┴──────────┘
Out[10]:
Took 2505ms
In [11]:
// Select rows whose column_5 equals the literal string "NA".
// The output has one matching row, so "NA" was kept as a string, not turned into null.
df.filter(
    &df.column("column_5")?.equal("NA")?
)?
Out[11]:
shape: (1, 5)
┌──────────┬──────────┬──────────┬──────────┬──────────┐
│ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5 │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
│ u8       ┆ u8       ┆ u8       ┆ u16      ┆ str      │
╞══════════╪══════════╪══════════╪══════════╪══════════╡
│ 0        ┆ 1        ┆ 2        ┆ 3        ┆ NA       │
└──────────┴──────────┴──────────┴──────────┴──────────┘
Out[11]:
Took 2465ms
In [12]:
// Select rows whose column_5 is null.
// The output has one matching row (the one with column_4 == 7).
df.filter(
    &df.column("column_5")?.is_null()
)?
Out[12]:
shape: (1, 5)
┌──────────┬──────────┬──────────┬──────────┬──────────┐
│ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5 │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
│ u8       ┆ u8       ┆ u8       ┆ u16      ┆ str      │
╞══════════╪══════════╪══════════╪══════════╪══════════╡
│ 0        ┆ 1        ┆ 2        ┆ 7        ┆ null     │
└──────────┴──────────┴──────────┴──────────┴──────────┘
Out[12]:
Took 2479ms

LazyCsvReader and LazyFrame

In [20]:
// Lazy variant: build a LazyFrame over the same CSV file.
// No schema is supplied here; the output shows the first four columns as i64.
let df: LazyFrame = LazyCsvReader::new("rank53_j0_j0.csv")
            // Treat the first line as data rather than as column names.
            .has_header(false)
            // No custom null markers are configured; "NA" still appears as a string in the output.
            .with_null_values(None)
            .finish()?;
// Collect the LazyFrame so the notebook can display the resulting table.
df.collect()?
Out[20]:
shape: (10, 5)
┌──────────┬──────────┬──────────┬──────────┬────────────────┐
│ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5       │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---            │
│ i64      ┆ i64      ┆ i64      ┆ i64      ┆ str            │
╞══════════╪══════════╪══════════╪══════════╪════════════════╡
│ 0        ┆ 1        ┆ 2        ┆ 0        ┆ 56229711839232 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 1        ┆ 57324928499712 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 2        ┆ 37744977903616 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 3        ┆ NA             │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ...      ┆ ...      ┆ ...      ┆ ...      ┆ ...            │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 6        ┆ 38019855810560 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 7        ┆ null           │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 8        ┆ 38157294764032 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0        ┆ 1        ┆ 2        ┆ 9        ┆ 38226014240768 │
└──────────┴──────────┴──────────┴──────────┴────────────────┘
Out[20]:
Took 3285ms
In [ ]:

Comments