Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!

Tips and Traps

  1. Polars is a blazingly fast DataFrame library implemented in Rust that uses Apache Arrow as its memory model. It supports multithreading and lazy computation.

  2. The Rust crate polars has many features. Be sure to include the features that are required for your use cases. Below are some commonly useful features.

    • lazy: Turns on support for lazy computation (LazyFrame, col, Expr, etc.). Note that col and Expr work only with LazyFrame.

    • parquet: Turns on support of Parquet format (read/write Parquet files).

  3. Polars CANNOT handle data larger than memory at this time (although this might change in the future).

  4. Polars does not currently provide a way to scan Parquet files row by row. You can use the Parquet crate to achieve this.

:timing
:sccache 1
:dep polars = { version = "0.26.1", features = ["lazy", "parquet"] }
sccache: true
use polars::df;
use polars::prelude::*;
use polars::datatypes::DataType;
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;

DataFrame.head

// Preview the first rows of `frame`; with `None` the output below contains 10 rows.
frame.head(None)
shape: (10, 4) ┌─────┬──────────────────┬──────────────────┬─────────────┐ │ id0 ┆ id1 ┆ id2 ┆ score_r4_it │ │ --- ┆ --- ┆ --- ┆ --- │ │ u64 ┆ u64 ┆ u64 ┆ f64 │ ╞═════╪══════════════════╪══════════════════╪═════════════╡ │ 2 ┆ 16796161 ┆ 4503599635760400 ┆ 1.07059 │ ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 16 ┆ 8293 ┆ 3288727552 ┆ -0.085568 │ ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 2 ┆ 4503599660926465 ┆ 28684 ┆ 4.779815 │ ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 4 ┆ 4503599627386947 ┆ 274911465616 ┆ 4.650999 │ ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ ... ┆ ... ┆ ... ┆ ... │ ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 2 ┆ 33560616 ┆ 274894685952 ┆ 4.494734 │ ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 32 ┆ 4503599629598992 ┆ 9007218582356096 ┆ 2.843227 │ ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 128 ┆ 4503599644148034 ┆ 229780750336 ┆ 6.006584 │ ├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 16 ┆ 4503599653715968 ┆ 2147746084 ┆ 1.758733 │ └─────┴──────────────────┴──────────────────┴─────────────┘

DataFrame.shape

// Dimensions of `frame` as a tuple: (rows, columns).
frame.shape()
(4002557, 4)

DataFrame.height

// Number of rows in `frame`.
frame.height()
4002557
// Number of columns in `frame`.
frame.width()
4

DataFrame.apply

/// Returns a copy of `id` cast to the `UInt64` data type.
///
/// # Panics
///
/// Panics if the cast returns an error.
fn as_u64(id: &Series) -> Series {
    let target = DataType::UInt64;
    id.cast(&target).unwrap()
}
df.apply("id0", as_u64);
df.apply("id1", as_u64);
df.apply("id2", as_u64);
df
shape: (4002557, 4) ┌──────┬──────────────────┬──────────────────┬─────────────┐ │ id0 ┆ id1 ┆ id2 ┆ score_r4_it │ │ --- ┆ --- ┆ --- ┆ --- │ │ u64 ┆ u64 ┆ u64 ┆ f64 │ ╞══════╪══════════════════╪══════════════════╪═════════════╡ │ 2 ┆ 16796161 ┆ 4503599635760400 ┆ 1.0705 │ ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 16 ┆ 8293 ┆ 3288727552 ┆ -0.085568 │ ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 2 ┆ 4503599660926465 ┆ 28684 ┆ 4.779815 │ ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 4 ┆ 4503599627386947 ┆ 274911465616 ┆ 4.650999 │ ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ ... ┆ ... ┆ ... ┆ ... │ ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 4 ┆ 68727866378 ┆ 4224323673915392 ┆ 11.800134 │ ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 4 ┆ 4199169 ┆ 1073875984 ┆ 0.594198 │ ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 512 ┆ 1114250 ┆ 70377336209696 ┆ 7.129028 │ ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 4096 ┆ 1057409 ┆ 4573976961483792 ┆ 18.1317 │ └──────┴──────────────────┴──────────────────┴─────────────┘
// Persist `df` to "j.parquet" using Snappy compression.
let f = File::create("j.parquet").expect("Unable to create file");
// The buffered writer is moved into the Parquet writer, so it need not be `mut`.
let bfw = BufWriter::new(f);
let pw = ParquetWriter::new(bfw).with_compression(ParquetCompression::Snappy);
// `finish` returns a `Result`; surface a failed write rather than ignoring it.
pw.finish(&mut df).expect("Unable to write Parquet file");

Loop Through Rows

{
    // Print the first (up to five) rows of `df`, one row per line, as
    // "<row index>: <debug form of each column's value>".
    let columns = df.get_columns();
    // Bound the row range by the frame's height so that frames with fewer than
    // five rows are not indexed past their end.
    let n_rows = df.height().min(5);
    for i in 0..n_rows {
        print!("{i}: ");
        // Visit the columns in order and print the value each holds at row `i`.
        for s in columns.iter() {
            print!("{:?} ", s.get(i));
        }
        println!();
    }
}
0: Int64(2) Int64(16796161) Int64(4503599635760400) Float64(1.0705899035734592) 
1: Int64(16) Int64(8293) Int64(3288727552) Float64(-0.08556843043513492) 
2: Int64(2) Int64(4503599660926465) Int64(28684) Float64(4.779815249979719) 
3: Int64(4) Int64(4503599627386947) Int64(274911465616) Float64(4.650999108662172) 
4: Int64(1) Int64(4194968) Int64(549822931264) Float64(0.5104124463171542) 
()