Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!

Tips and Traps

  1. LazyCsvReader is more limited than CsvReader: CsvReader supports specifying a schema, while LazyCsvReader does not.

  2. An empty field is parsed as null instead of an empty string by default, and there is currently no way to change this behavior. Please refer to this issue for more discussion. Non-empty values are NOT parsed as null by default. However, parsing specific strings (e.g. NA) into null is supported via the API CsvReader::with_null_values.

:timing
:sccache 1
:dep polars = { version = "0.26.1", features = ["lazy", "parquet", "dtype-full"] }
Timing: true sccache: true
Loading...
use polars::df;
use polars::prelude::*;
use polars::datatypes::DataType;
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;
Loading...

CsvReader and DataFrame

// Build an explicit schema for the five columns of the header-less CSV:
// three UInt8 columns, one UInt16 column and one Utf8 (string) column.
let mut s = Schema::new();
s.with_column("column_1".into(), DataType::UInt8);
s.with_column("column_2".into(), DataType::UInt8);
s.with_column("column_3".into(), DataType::UInt8);
s.with_column("column_4".into(), DataType::UInt16);
s.with_column("column_5".into(), DataType::Utf8);
// Trailing expression: the notebook prints the schema as the cell output.
s
Schema: name: column_1, data type: UInt8 name: column_2, data type: UInt8 name: column_3, data type: UInt8 name: column_4, data type: UInt16 name: column_5, data type: Utf8
Loading...
// Eagerly read the CSV file into a DataFrame; each `?` propagates a failure
// from opening or parsing the file.
let df = CsvReader::from_path("rank53_j0_j0.csv")?
            // The file has no header row.
            .has_header(false)
            // Apply the column dtypes declared in the schema `s`.
            .with_dtypes(Some(&s))
            // No additional null markers are configured: in the printed output the
            // literal "NA" stays a string, while one row still shows a null value.
            .with_null_values(None)
            .finish()?;
// Trailing expression: the notebook prints the DataFrame as the cell output.
df
shape: (10, 5) ┌──────────┬──────────┬──────────┬──────────┬────────────────┐ │ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5 │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ u8 ┆ u8 ┆ u8 ┆ u16 ┆ str │ ╞══════════╪══════════╪══════════╪══════════╪════════════════╡ │ 0 ┆ 1 ┆ 2 ┆ 0 ┆ 56229711839232 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 1 ┆ 57324928499712 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 2 ┆ 37744977903616 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 3 ┆ NA │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ ... ┆ ... ┆ ... ┆ ... ┆ ... │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 6 ┆ 38019855810560 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 7 ┆ null │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 8 ┆ 38157294764032 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 9 ┆ 38226014240768 │ └──────────┴──────────┴──────────┴──────────┴────────────────┘
Loading...
// Keep only the rows whose `column_5` equals the empty string.
// The printed result has 0 rows, i.e. no value was read as "".
df.filter(
    &df.column("column_5")?.equal("")?
)?
shape: (0, 5) ┌──────────┬──────────┬──────────┬──────────┬──────────┐ │ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5 │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ u8 ┆ u8 ┆ u8 ┆ u16 ┆ str │ ╞══════════╪══════════╪══════════╪══════════╪══════════╡ └──────────┴──────────┴──────────┴──────────┴──────────┘
Loading...
// Keep only the rows whose `column_5` equals the literal string "NA".
// The printed result has 1 row, so "NA" was read as an ordinary string, not null.
df.filter(
    &df.column("column_5")?.equal("NA")?
)?
shape: (1, 5) ┌──────────┬──────────┬──────────┬──────────┬──────────┐ │ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5 │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ u8 ┆ u8 ┆ u8 ┆ u16 ┆ str │ ╞══════════╪══════════╪══════════╪══════════╪══════════╡ │ 0 ┆ 1 ┆ 2 ┆ 3 ┆ NA │ └──────────┴──────────┴──────────┴──────────┴──────────┘
Loading...
// Keep only the rows whose `column_5` is null.
// The printed result has 1 row (the one with column_4 == 7).
df.filter(
    &df.column("column_5")?.is_null()
)?
shape: (1, 5) ┌──────────┬──────────┬──────────┬──────────┬──────────┐ │ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5 │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ u8 ┆ u8 ┆ u8 ┆ u16 ┆ str │ ╞══════════╪══════════╪══════════╪══════════╪══════════╡ │ 0 ┆ 1 ┆ 2 ┆ 7 ┆ null │ └──────────┴──────────┴──────────┴──────────┴──────────┘
Loading...

LazyCsvReader and LazyFrame

// Lazy counterpart of the eager read: build a LazyFrame over the same CSV file.
let df: LazyFrame = LazyCsvReader::new("rank53_j0_j0.csv")
            // The file has no header row.
            .has_header(false)
            // No additional null markers are configured.
            .with_null_values(None)
            .finish()?;
// Materialize the lazy query into a DataFrame. No dtypes are supplied here, and
// the printed output shows the integer columns as i64.
df.collect()?
shape: (10, 5) ┌──────────┬──────────┬──────────┬──────────┬────────────────┐ │ column_1 ┆ column_2 ┆ column_3 ┆ column_4 ┆ column_5 │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ str │ ╞══════════╪══════════╪══════════╪══════════╪════════════════╡ │ 0 ┆ 1 ┆ 2 ┆ 0 ┆ 56229711839232 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 1 ┆ 57324928499712 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 2 ┆ 37744977903616 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 3 ┆ NA │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ ... ┆ ... ┆ ... ┆ ... ┆ ... │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 6 ┆ 38019855810560 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 7 ┆ null │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 8 ┆ 38157294764032 │ ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 0 ┆ 1 ┆ 2 ┆ 9 ┆ 38226014240768 │ └──────────┴──────────┴──────────┴──────────┴────────────────┘
Loading...