Operations
Creating a LazyFrame
import pyfloe as pf
ff = pf.LazyFrame([{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}])
ff = pf.LazyFrame([user1, user2, user3]) # objects with __dict__
ff = pf.read_csv("data.csv")
ff = pf.from_iter(my_generator())
Selecting, filtering, computing
import pyfloe as pf
ff.select("name", "age")
ff.drop("internal_id")
ff.filter(pf.col("amount") > 100)
ff.with_column("tax", pf.col("amount") * 0.2)
ff.with_columns(tax=pf.col("amount") * 0.2, q=pf.col("ts").dt.quarter())
Sorting
ff.sort("amount", ascending=False)
Joining
# Hash join (default) — materializes right side
orders.join(customers, on="customer_id", how="left")
# Sort-merge join — O(1) memory for pre-sorted inputs
orders.sort("id").join(customers.sort("id"), on="id", sorted=True)
Grouping and aggregation
import pyfloe as pf
# Hash aggregation (default)
ff.group_by("region").agg(pf.col("amount").sum().alias("total"))
# Sorted streaming aggregation — O(1) memory per group
ff.sort("region").group_by("region", sorted=True).agg(
pf.col("amount").sum().alias("total")
)
Other operations
ff.rename({"old": "new"})
ff.explode("tags")
ff.union(other_ff)
ff.apply(str, columns=["amount"])
ff.head(10)
ff[0] # first row as dict
ff[5:10] # slice → new LazyFrame
Query plan and optimizer
print(pipeline.explain())
# Project [order_id, name, amount]
# Filter [(col("segment") == 'Enterprise')]
# Filter [(col("region") == 'EU')]
# Join [inner] ['customer_id'] = ['customer_id']
# Scan [...] (6 rows)
# Scan [...] (4 rows)
print(pipeline.explain(optimized=True))
# Project [order_id, name, amount]
# Join [inner] ...
# Filter [(col("region") == 'EU')] ← pushed into left branch
# Scan [...]
# Filter [(col("segment") == 'Enterprise')] ← pushed into right branch
# Scan [...]