In [1]:
from typing import List, Tuple
import pandas as pd
In [2]:
from pathlib import Path
import findspark
findspark.init(str(next(Path("/opt").glob("spark-3*"))))
# findspark.init("/opt/spark-2.3.0-bin-hadoop2.7")
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import (
    IntegerType,
    StringType,
    StructType,
    StructField,
    ArrayType,
)
spark = (
    SparkSession.builder.appName("PySpark_Str_Func").enableHiveSupport().getOrCreate()
)
In [3]:
df = spark.createDataFrame(
    pd.DataFrame(
        data=[([1, 2], "how", 1), ([2, 3], "are", 2), ([3, 4], "you", 3)],
        columns=["col1", "col2", "col3"],
    )
)
df.show()
Boolean Operators and Functions¶
Please refer to Boolean Operators and Functions for details.
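As a minimal sketch of the kind of expressions covered there, boolean conditions on the df above can be combined with &, | and ~ (each comparison wrapped in parentheses because of Python operator precedence):
In [ ]:
# keep rows where col3 > 1 and col2 is not "are"
df.filter((col("col3") > 1) & (col("col2") != "are")).show()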
Rounding Functions¶
Please refer to Rounding Functions in Spark for details.
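A quick sketch using the rounding functions from pyspark.sql.functions (already available here via the star import above):
In [ ]:
# round col3 / 2 to 1 decimal place, and take its ceiling and floor
df.select(
    round(col("col3") / 2, 1).alias("rounded"),
    ceil(col("col3") / 2).alias("ceiling"),
    floor(col("col3") / 2).alias("floor"),
).show()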
String Functions¶
Please refer to String Functions in Spark for details.
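For illustration, a few common string functions applied to col2:
In [ ]:
df.select(
    upper(col("col2")).alias("upper"),
    length(col("col2")).alias("length"),
    substring(col("col2"), 1, 2).alias("first_2_chars"),
).show()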
Statistical Functions¶
Please refer to Statistical Functions in Spark for details.
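A small sketch of a global aggregation over col3:
In [ ]:
df.select(mean("col3"), stddev("col3"), min("col3"), max("col3")).show()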
Date Functions in Spark¶
Please refer to Date Functions in Spark for details.
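The df above has no date column, so the sketch below just builds dates from the current date; see the linked post for the full list of date functions.
In [ ]:
df.select(
    current_date().alias("today"),
    date_add(current_date(), 7).alias("one_week_later"),
).show()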
Window Functions in Spark¶
Please refer to Window Functions in Spark for details.
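A minimal window-function sketch (Window.orderBy without partitionBy moves all rows into a single partition, which is fine for this toy df):
In [ ]:
from pyspark.sql import Window
df.select(
    "col2",
    "col3",
    row_number().over(Window.orderBy("col3")).alias("row_number"),
).show()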
Collection Functions¶
Please refer to Collection Functions for details.
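Since col1 is an array column, here is a quick sketch of two collection functions:
In [ ]:
df.select(
    col("col1"),
    array_contains(col("col1"), 2).alias("contains_2"),
    size(col("col1")).alias("size"),
).show()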
between¶
In [7]:
df.filter(col("col2").between("hoa", "hox")).show()
In [8]:
df.filter(col("col3").between(2, 3)).show()
cast¶
In [12]:
df2 = df.select(col("col1"), col("col2"), col("col3").astype(StringType()))
df2.show()
In [13]:
df2.schema
Out[13]:
In [15]:
df3 = df2.select(col("col1"), col("col2"), col("col3").cast(IntegerType()))
df3.show()
In [16]:
df3.schema
Out[16]:
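Column.astype is simply an alias of Column.cast, and both also accept the type name as a string instead of a DataType object; a quick sketch:
In [ ]:
df2.select(col("col3").cast("int")).schema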
lit¶
In [4]:
x = lit(1)
In [5]:
type(x)
Out[5]:
pyspark.sql.column.Column
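lit is typically used to add a constant column or to compare a column with a literal value, e.g.:
In [ ]:
# add a constant column (the column name "one" is just for illustration)
df.withColumn("one", lit(1)).show()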
hash¶
In [7]:
df.withColumn("hash_code", hash("col2")).show()
when¶
A null in a when condition is treated as false.
In [1]:
df = spark.read.json("../data/people.json")
df.show()
In [3]:
# null ages do not satisfy the condition, so those rows fall through to otherwise and get 0
df.select(when(col("age") > 20, 1).otherwise(0).alias("gt20")).show()
In [5]:
df.select(when(col("age") <= 20, 1).otherwise(0).alias("le20")).show()
In [6]:
# handle null explicitly before the other branches
df.select(when(col("age").isNull(), 0).when(col("age") > 20, 100).otherwise(10).alias("age")).show()
In [7]:
# without an otherwise clause, unmatched rows are null
df.select(when(col("age").isNull(), 0).alias("age")).show()