Ben Chuanlong Du's Blog

It is never too late to learn.

Hands on the Rust Crate Parquet

Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!

Comments

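  1. Note that every cell of a Parquet row is a value of type `Field` (`parquet::record::Field`), an enum with one variant per supported value type (e.g., `Field::Long`, `Field::ULong`, `Field::Double`, `Field::Str`), so you have to pattern-match on a cell to get the underlying Rust value.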
In [2]:
:timing
:sccache 1
:dep parquet = ">=14.0.0"
In [4]:
use std::fs::File;
use std::path::Path;
use parquet::file::reader::{FileReader, SerializedFileReader};

// Open the Parquet file and dump every cell of every row: the column's
// position within the row, the column name, and the cell value.
let file = File::open("./bench.parquet").unwrap();
let reader = SerializedFileReader::new(file).unwrap();
for record in reader.get_row_iter(None).unwrap() {
    // `get_column_iter` yields (column name, cell) pairs; `enumerate` adds the position.
    record
        .get_column_iter()
        .enumerate()
        .for_each(|(position, (column, value))| {
            println!("column index: {}, column name: {}, column value: {}", position, column, value);
        });
}
column index: 0, column name: id0, column value: 9007474132647936
column index: 1, column name: id1, column value: 7424
column index: 2, column name: id2, column value: 4362862139015168
column index: 3, column name: row0, column value: "3j Ah"
column index: 4, column name: row1, column value: "Ad Kd Qd Td"
column index: 5, column name: row2, column value: "As Ks Qs Js Ts"
column index: 6, column name: time_ana, column value: 0.514239655
column index: 7, column name: time_sim, column value: 0.513780129
column index: 8, column name: time_sim_unopt, column value: 2.6066436360000003
column index: 9, column name: time_it_1k, column value: 0.0911758424762985
column index: 10, column name: time_it, column value: 1.125201072
column index: 0, column name: id0, column value: 9007474132647936
column index: 1, column name: id1, column value: 6912
column index: 2, column name: id2, column value: 4362862139015168
column index: 3, column name: row0, column value: "3j Ah"
column index: 4, column name: row1, column value: "Ad Kd Jd Td"
column index: 5, column name: row2, column value: "As Ks Qs Js Ts"
column index: 6, column name: time_ana, column value: 0.521932472
column index: 7, column name: time_sim, column value: 0.516484461
column index: 8, column name: time_sim_unopt, column value: 2.604909176
column index: 9, column name: time_it_1k, column value: 0.0986344892634308
column index: 10, column name: time_it, column value: 1.217248232
column index: 0, column name: id0, column value: 9007474132647936
column index: 1, column name: id1, column value: 5888
column index: 2, column name: id2, column value: 4362862139015168
column index: 3, column name: row0, column value: "3j Ah"
column index: 4, column name: row1, column value: "Ad Qd Jd Td"
column index: 5, column name: row2, column value: "As Ks Qs Js Ts"
column index: 6, column name: time_ana, column value: 0.544197511
column index: 7, column name: time_sim, column value: 0.53159728
column index: 8, column name: time_sim_unopt, column value: 2.650837948
column index: 9, column name: time_it_1k, column value: 0.0983330353293898
column index: 10, column name: time_it, column value: 1.213527989
column index: 0, column name: id0, column value: 9007474132647936
column index: 1, column name: id1, column value: 3840
column index: 2, column name: id2, column value: 4362862139015168
column index: 3, column name: row0, column value: "3j Ah"
column index: 4, column name: row1, column value: "Kd Qd Jd Td"
column index: 5, column name: row2, column value: "As Ks Qs Js Ts"
column index: 6, column name: time_ana, column value: 0.612770978
column index: 7, column name: time_sim, column value: 0.589677791
column index: 8, column name: time_sim_unopt, column value: 2.699055747
column index: 9, column name: time_it_1k, column value: 0.0893710769791751
column index: 10, column name: time_it, column value: 1.102928461
column index: 0, column name: id0, column value: 9007203549708288
column index: 1, column name: id1, column value: 7680
column index: 2, column name: id2, column value: 4362862139015168
column index: 3, column name: row0, column value: "3j 8h"
column index: 4, column name: row1, column value: "Ad Kd Qd Jd"
column index: 5, column name: row2, column value: "As Ks Qs Js Ts"
column index: 6, column name: time_ana, column value: 0.541861286
column index: 7, column name: time_sim, column value: 0.559772314
column index: 8, column name: time_sim_unopt, column value: 2.870595327
column index: 9, column name: time_it_1k, column value: 0.1010760360586662
column index: 10, column name: time_it, column value: 1.247379361
Out[4]:
()

Parse a Parquet file without knowing the order of columns.

In [25]:
// Check the stored round-4 score of every row against a freshly computed one.
// Columns are looked up by name, so their order in the file does not matter.
let mut play = PlayRounds::default();
let file = File::open("/workdir/ofcp_test_data/test_data_11_dedup/part-000.parquet").unwrap();
let reader = SerializedFileReader::new(file).unwrap();
for row in reader.get_row_iter(None).unwrap() {
    // A column that is absent from the row leaves its variable at the zero default below.
    let mut id0 = 0u64;
    let mut id1 = 0u64;
    let mut id2 = 0u64;
    let mut score_it = 0f64;
    for (name, field) in row.get_column_iter() {
        // Dispatch on the column name and the cell's `Field` variant at once.
        // A known column holding an unexpected variant is a hard error.
        match (name.as_str(), field) {
            // The ids are stored as signed 64-bit values in this file and are
            // reinterpreted as u64 with `as`.
            ("id0", &Field::Long(id)) => id0 = id as u64,
            ("id0", _) => panic!("Wrong type for id0!"),
            ("id1", &Field::Long(id)) => id1 = id as u64,
            ("id1", _) => panic!("Wrong type for id1!"),
            ("id2", &Field::Long(id)) => id2 = id as u64,
            ("id2", _) => panic!("Wrong type for id2!"),
            ("score_r4_it", &Field::Double(s)) => score_it = s,
            ("score_r4_it", _) => panic!("Wrong type for score!"),
            // Any other column is irrelevant to this check.
            _ => {}
        }
    }
    play.set_ids(id0, id1, id2);
    let mut method = BruteForceMethod::Iteration;
    let s = play.score_r4(&from_id(ALL ^ id0 ^ id1 ^ id2).unwrap(), 2.0, &mut method);
    // The recomputed score must agree with the stored one within 1E-8.
    let delta = (score_it - s).abs();
    assert!(
        delta < 1E-8,
        "Round 4 score for the following case is not calculated correctly!
        id0: {id0}, id1: {id1}, id2: {id2}, score_it: {score_it}, score_ana: {s}, delta: {delta}
        ",
    )
}
Out[25]:
()

Things become much easier if you know the exact order of the columns.

In [9]:
// Check the stored round-4 score of every row against a freshly computed one.
// The first four columns of each row are consumed positionally as
// id0, id1, id2 (each `Field::ULong`) and the score (`Field::Double`).
let mut play = PlayRounds::default();
let file = File::open("/workdir/ofcp_test_data/clean/test_data_11_dedup/part-000.parquet").unwrap();
let reader = SerializedFileReader::new(file).unwrap();
for record in reader.get_row_iter(None).unwrap() {
    let mut columns = record.get_column_iter();

    // Each `next()` yields a (column name, cell) pair; only the cell is needed here.
    let (_, cell) = columns.next().unwrap();
    let id0 = if let &Field::ULong(id) = cell {
        id
    } else {
        panic!("Wrong type for id0!")
    };

    let (_, cell) = columns.next().unwrap();
    let id1 = if let &Field::ULong(id) = cell {
        id
    } else {
        panic!("Wrong type for id1!")
    };

    let (_, cell) = columns.next().unwrap();
    let id2 = if let &Field::ULong(id) = cell {
        id
    } else {
        panic!("Wrong type for id2!")
    };

    let (_, cell) = columns.next().unwrap();
    let score_it = if let &Field::Double(value) = cell {
        value
    } else {
        panic!("Wrong type for score!")
    };

    play.set_ids(id0, id1, id2);
    let mut strategy = BruteForceMethod::Iteration;
    let decoded = from_id(ALL ^ id0 ^ id1 ^ id2).unwrap();
    let s = play.score_r4(&decoded, 2.0, &mut strategy);
    // The recomputed score must agree with the stored one within 1E-8.
    let delta = (score_it - s).abs();
    assert!(
        delta < 1E-8,
        "Round 4 score for the following case is not calculated correctly!
        id0: {id0}, id1: {id1}, id2: {id2}, score_it: {score_it}, score_ana: {s}, delta: {delta}
        ",
    )
}
Out[9]:
()
In [ ]:

Comments