Ben Chuanlong Du's Blog

It is never too late to learn.

Hands on the Polars Crate in Rust

Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!

Tips and Traps

  1. Polars is a blazingly fast DataFrames library implemented in Rust using Apache Arrow as memory model. It supports multithreading and lazy computation.

  2. The Rust crate polars has many features. Be sure to enable the features which are required for your use cases. Below are some commonly useful features.

    • lazy: Turns on support of lazy computation (LazyFrame, col, Expr, etc.). Notice that col and Expr are only for LazyFrame.
    • parquet: Turns on support of Parquet format (read/write Parquet files).
  3. Polars CANNOT handle data larger than memory at this time (even though this might change in the future).

  4. Polars does not currently provide a way to scan Parquet files row by row. You can use the parquet crate to achieve this.

In [2]:
:timing
:sccache 1
:dep polars = { version = "0.26.1", features = ["lazy", "parquet"] }
Out[2]:
sccache: true
In [6]:
use polars::df;
use polars::prelude::*;
use polars::datatypes::DataType;
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;

DataFrame.head

In [10]:
// Show the first rows of the frame; passing `None` falls back to the default
// row count (the output below has 10 rows).
frame.head(None)
Out[10]:
shape: (10, 4)
┌─────┬──────────────────┬──────────────────┬─────────────┐
│ id0 ┆ id1              ┆ id2              ┆ score_r4_it │
│ --- ┆ ---              ┆ ---              ┆ ---         │
│ u64 ┆ u64              ┆ u64              ┆ f64         │
╞═════╪══════════════════╪══════════════════╪═════════════╡
│ 2   ┆ 16796161         ┆ 4503599635760400 ┆ 1.07059     │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 16  ┆ 8293             ┆ 3288727552       ┆ -0.085568   │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ 4503599660926465 ┆ 28684            ┆ 4.779815    │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4   ┆ 4503599627386947 ┆ 274911465616     ┆ 4.650999    │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ...              ┆ ...              ┆ ...         │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ 33560616         ┆ 274894685952     ┆ 4.494734    │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 32  ┆ 4503599629598992 ┆ 9007218582356096 ┆ 2.843227    │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 128 ┆ 4503599644148034 ┆ 229780750336     ┆ 6.006584    │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 16  ┆ 4503599653715968 ┆ 2147746084       ┆ 1.758733    │
└─────┴──────────────────┴──────────────────┴─────────────┘

DataFrame.shape

In [9]:
// Dimensions of the frame as a tuple: (number of rows, number of columns).
frame.shape()
Out[9]:
(4002557, 4)

DataFrame.height

In [10]:
// Number of rows in the frame (the first element of `shape()`).
frame.height()
Out[10]:
4002557
In [11]:
// Number of columns in the frame (the second element of `shape()`).
frame.width()
Out[11]:
4

DataFrame.apply

In [27]:
/// Casts the given series to `UInt64` and returns the result as a new series.
///
/// # Panics
///
/// Panics if the cast returns an error.
fn as_u64(id: &Series) -> Series {
    let target = DataType::UInt64;
    id.cast(&target).unwrap()
}
In [29]:
df.apply("id0", as_u64);
df.apply("id1", as_u64);
df.apply("id2", as_u64);
df
Out[29]:
shape: (4002557, 4)
┌──────┬──────────────────┬──────────────────┬─────────────┐
│ id0  ┆ id1              ┆ id2              ┆ score_r4_it │
│ ---  ┆ ---              ┆ ---              ┆ ---         │
│ u64  ┆ u64              ┆ u64              ┆ f64         │
╞══════╪══════════════════╪══════════════════╪═════════════╡
│ 2    ┆ 16796161         ┆ 4503599635760400 ┆ 1.0705      │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 16   ┆ 8293             ┆ 3288727552       ┆ -0.085568   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2    ┆ 4503599660926465 ┆ 28684            ┆ 4.779815    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 4503599627386947 ┆ 274911465616     ┆ 4.650999    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ...  ┆ ...              ┆ ...              ┆ ...         │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 68727866378      ┆ 4224323673915392 ┆ 11.800134   │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4    ┆ 4199169          ┆ 1073875984       ┆ 0.594198    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 512  ┆ 1114250          ┆ 70377336209696   ┆ 7.129028    │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4096 ┆ 1057409          ┆ 4573976961483792 ┆ 18.1317     │
└──────┴──────────────────┴──────────────────┴─────────────┘
In [36]:
// Create (or truncate) the output file; panics if it cannot be created.
let f = File::create("j.parquet").expect("Unable to create file");
// Buffer writes to the file. No `mut` is needed: the writer is moved into
// `ParquetWriter` below rather than mutated here.
let bfw = BufWriter::new(f);
// Configure a Parquet writer that compresses its output with Snappy.
let pw = ParquetWriter::new(bfw).with_compression(ParquetCompression::Snappy);
In [38]:
// Write `df` through the Parquet writer. `finish` returns a `Result`, so
// surface a write failure instead of silently ignoring it.
pw.finish(&mut df).expect("Unable to write Parquet file");

Loop Through Rows

In [19]:
{
    // Borrow the columns once; each row is read across them by index.
    let columns = df.get_columns();
    // Print at most the first 5 rows, clamped to the frame's height so that
    // frames with fewer rows are not indexed out of bounds.
    let n_rows = df.height().min(5);
    for i in 0..n_rows {
        print!("{i}: ");
        // Debug-print the i-th value of every column on a single line.
        for s in columns.iter() {
            print!("{:?} ", s.get(i));
        }
        println!();
    }
}
0: Int64(2) Int64(16796161) Int64(4503599635760400) Float64(1.0705899035734592) 
1: Int64(16) Int64(8293) Int64(3288727552) Float64(-0.08556843043513492) 
2: Int64(2) Int64(4503599660926465) Int64(28684) Float64(4.779815249979719) 
3: Int64(4) Int64(4503599627386947) Int64(274911465616) Float64(4.650999108662172) 
4: Int64(1) Int64(4194968) Int64(549822931264) Float64(0.5104124463171542) 
Out[19]:
()
In [ ]:

Comments