# From Pandas to Polars: A Paradigm Shift in DataFrame Processing

This notebook accompanies the blog post comparing Pandas and Polars. It contains all the code examples for you to run and experiment with.

## Concept 1: The Expression-Based Paradigm

In [None]:
import pandas as pd
import polars as pl

# Build the same three-row dataset in both libraries so the results can be compared side by side
data = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'score': [85, 92, 78],
}
pd_df, pl_df = pd.DataFrame(data), pl.DataFrame(data)

In [None]:
# Pandas: derive new columns by assigning a Series directly onto the existing frame
pd_df['score_doubled'] = pd_df['score'].mul(2)
pd_df['passed'] = pd_df['score'].ge(80)
print("Pandas Result:", pd_df, sep="\n")

In [None]:
# Polars: new columns are described as expressions and attached with with_columns.
# Passing an expression as a keyword argument names the resulting column after the keyword.
pl_df = pl_df.with_columns(
    score_doubled=pl.col("score") * 2,
    passed=pl.col("score") >= 80,
)
print("Polars Result:", pl_df, sep="\n")

## Concept 2: The Four Essential Contexts

In [None]:
import pandas as pd
import polars as pl

# Four employees across two departments, loaded identically into pandas and Polars
data = {
    'department': ['Sales', 'Sales', 'Engineering', 'Engineering'],
    'employee': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'salary': [50000, 60000, 75000, 80000],
}
pd_df, pl_df = pd.DataFrame(data), pl.DataFrame(data)

In [None]:
# Pandas: the three basic operations (project, filter, aggregate)

# Column projection
result_select = pd_df.loc[:, ['department', 'salary']]

# Row filter through a boolean mask
result_filter = pd_df.loc[pd_df['salary'].gt(55000)]

# Per-department mean and max; reset_index turns the group key back into a regular column
result_agg = (
    pd_df
    .groupby('department')['salary']
    .agg(['mean', 'max'])
    .reset_index()
)

print("Pandas Aggregation:", result_agg, sep="\n")

In [None]:
# Polars Operations
# Select columns
result_select = pl_df.select("department", "salary")

# Filter rows
result_filter = pl_df.filter(pl.col("salary") > 55000)

# Group and aggregate.
# group_by makes no promise about the order in which groups come back, so the
# result is sorted on the key: the printed table is then the same on every run
# and lists the departments in key order.
result_agg = (
    pl_df.group_by("department")
    .agg(
        pl.col("salary").mean().alias("salary_mean"),
        pl.col("salary").max().alias("salary_max"),
    )
    .sort("department")
)

print("Polars Aggregation:")
print(result_agg)

## Concept 3: No More Index

In [None]:
import pandas as pd
import polars as pl

# Three id/value pairs, loaded identically into pandas and Polars
data = {
    'id': [101, 102, 103],
    'value': [10, 20, 30],
}
pd_df, pl_df = pd.DataFrame(data), pl.DataFrame(data)

In [None]:
# Pandas: promote 'id' to the index, then look a row up by its label
pd_df_indexed = pd_df.set_index('id')
result = pd_df_indexed.loc[102]  # a single label yields a Series
print("Pandas loc result:", result, sep="\n")

In [None]:
# Polars: there is no index to set; the row is selected with an ordinary filter on the column
result = pl_df.filter(pl.col("id").eq(102))  # the result is still a DataFrame
print("Polars filter result:", result, sep="\n")

## Concept 4: Strict Data Types

In [None]:
import pandas as pd
import polars as pl
import numpy as np

# Pandas: the missing entry is written as NaN
pd_df = pd.DataFrame({'values': [1, 2, np.nan, 4]})
print("Pandas dtypes:", pd_df.dtypes, sep="\n")  # float64 - silently converted!

# Polars: the missing entry is written as None (null)
pl_df = pl.DataFrame({'values': [1, 2, None, 4]})
print("\nPolars schema:", pl_df.schema, sep="\n")  # 'values' is Int64 - stays integer!

## Concept 6: Conditional Logic

In [None]:
import pandas as pd
import polars as pl
import numpy as np

# Five scores on either side of the grade boundaries used in the following cells
data = {
    'score': [45, 65, 85, 92, 55],
}
pd_df, pl_df = pd.DataFrame(data), pl.DataFrame(data)

In [None]:
# Pandas: a two-way split is a single np.where call
pd_df['grade'] = np.where(pd_df['score'].ge(60), 'Pass', 'Fail')

# More than two outcomes means nesting np.where; each "else" branch holds the next test,
# so the thresholds are checked from highest to lowest
pd_df['letter_grade'] = np.where(
    pd_df['score'].ge(90), 'A',
    np.where(
        pd_df['score'].ge(80), 'B',
        np.where(
            pd_df['score'].ge(70), 'C',
            np.where(pd_df['score'].ge(60), 'D', 'F'),
        ),
    ),
)

print("Pandas Result:", pd_df, sep="\n")

In [None]:
# Polars: conditions are written as a flat when/then chain instead of nested calls.
# pl.lit marks the outcomes as literal strings.

# Two-way split
pass_fail = (
    pl.when(pl.col("score") >= 60)
    .then(pl.lit("Pass"))
    .otherwise(pl.lit("Fail"))
)

# Multi-way split: branches are tested top to bottom, highest threshold first
letter = (
    pl.when(pl.col("score") >= 90).then(pl.lit("A"))
    .when(pl.col("score") >= 80).then(pl.lit("B"))
    .when(pl.col("score") >= 70).then(pl.lit("C"))
    .when(pl.col("score") >= 60).then(pl.lit("D"))
    .otherwise(pl.lit("F"))
)

pl_df = pl_df.with_columns(
    pass_fail.alias("grade"),
    letter.alias("letter_grade"),
)

print("Polars Result:", pl_df, sep="\n")

## Concept 7: Window Functions

In [None]:
import pandas as pd
import polars as pl

# Five employees across two departments, loaded identically into pandas and Polars
data = {
    'department': ['Sales', 'Sales', 'Engineering', 'Engineering', 'Sales'],
    'employee': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'salary': [50000, 60000, 75000, 80000, 55000],
}
pd_df, pl_df = pd.DataFrame(data), pl.DataFrame(data)

In [None]:
# Pandas: Add department average salary as a new column.
# transform('mean') returns one value per original row (the mean of that row's
# department), so the result can be assigned straight back onto the frame.
pd_df['dept_avg_salary'] = pd_df.groupby('department')['salary'].transform('mean')

# Calculate deviation from department mean.
# The per-row department mean was just stored above, so it is reused here
# instead of running the same groupby/transform a second time.
pd_df['salary_deviation'] = pd_df['salary'] - pd_df['dept_avg_salary']

print("Pandas Result:")
print(pd_df)

In [None]:
# Polars: .over() evaluates an aggregation per group while keeping every original row
dept_mean = pl.col("salary").mean().over("department")

pl_df = pl_df.with_columns(
    dept_mean.alias("dept_avg_salary"),
    (pl.col("salary") - dept_mean).alias("salary_deviation"),
)

print("Polars Result:", pl_df, sep="\n")

## Concept 8: Avoid `apply()`

In [None]:
import pandas as pd
import polars as pl

# One string column and one integer column, loaded identically into pandas and Polars
data = {
    'text': ['hello', 'world', 'polars'],
    'value': [1, 2, 3],
}
pd_df, pl_df = pd.DataFrame(data), pl.DataFrame(data)

In [None]:
# Pandas: apply calls a Python function once per element
pd_df['text_upper'] = pd_df['text'].apply(lambda s: s.upper())
pd_df['value_squared'] = pd_df['value'].apply(lambda n: n ** 2)

print("Pandas Result:", pd_df, sep="\n")

In [None]:
# Polars: preferred - express the same transformations with built-in expression methods
pl_df = pl_df.with_columns(
    pl.col("text").str.to_uppercase().alias("text_upper"),
    pl.col("value").pow(2).alias("value_squared"),
)

print("Polars Result:", pl_df, sep="\n")

## Getting Started

In [None]:
import polars as pl

# A first Polars DataFrame: three people with an age and a home city
df = pl.DataFrame(
    {
        "name": ["Alice", "Bob", "Charlie"],
        "age": [25, 30, 35],
        "city": ["NYC", "LA", "Chicago"],
    }
)

# A first expression pipeline: add two derived columns, then keep rows with age above 26.
# Keyword arguments to with_columns name the new columns.
result = df.with_columns(
    age_in_5_years=pl.col("age") + 5,
    city_upper=pl.col("city").str.to_uppercase(),
).filter(pl.col("age") > 26)

print(result)