What is Pandas?
Pandas is Python's most popular library for data manipulation and analysis. It provides powerful data structures like DataFrame and Series that make working with structured data intuitive and efficient.
Think of Pandas as Excel for Python - but with much more power and flexibility for handling large datasets.
Creating DataFrames
import pandas as pd
# From dictionary
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'city': ['NYC', 'LA', 'Chicago']
})
# From CSV file
df = pd.read_csv('data.csv')
# From Excel
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# From SQL
import sqlalchemy
engine = sqlalchemy.create_engine('postgresql://...')
df = pd.read_sql('SELECT * FROM orders', engine)
# Quick look at data
print(df.head()) # First 5 rows
print(df.tail()) # Last 5 rows
print(df.info()) # Column types, non-null counts
print(df.describe()) # Statistical summary
print(df.shape) # (rows, columns)
Selecting Data
# Select columns
df['name'] # Single column (Series)
df[['name', 'age']] # Multiple columns (DataFrame)
# Select rows by position
df.iloc[0] # First row
df.iloc[0:5] # First 5 rows
df.iloc[0:5, 0:2] # Rows 0-4, columns 0-1
# Select rows by label
df.loc[0] # Row with index 0
df.loc[0:4, 'name':'city'] # Rows 0-4, columns name to city
# Conditional selection
df[df['age'] > 25] # Age over 25
df[(df['age'] > 25) & (df['city'] == 'NYC')] # Multiple conditions
df[df['city'].isin(['NYC', 'LA'])] # City in list
df[df['name'].str.contains('li')] # Name contains 'li'
# Query method (SQL-like)
df.query('age > 25 and city == "NYC"')
Data Cleaning
# Handle missing values
df.isnull().sum() # Count nulls per column
df.dropna() # Drop rows with any null
df.dropna(subset=['name']) # Drop if 'name' is null
df.fillna(0) # Fill nulls with 0
df.fillna(df.mean()) # Fill with column mean
df['col'].fillna(method='ffill') # Forward fill
# Remove duplicates
df.duplicated().sum() # Count duplicates
df.drop_duplicates() # Remove duplicates
df.drop_duplicates(subset=['email']) # Based on column
# Data types
df['date'] = pd.to_datetime(df['date'])
df['amount'] = df['amount'].astype(float)
df['category'] = df['category'].astype('category')
# String operations
df['name'] = df['name'].str.lower()
df['name'] = df['name'].str.strip()
df['name'] = df['name'].str.replace('old', 'new')
# Rename columns
df.rename(columns={'old_name': 'new_name'}, inplace=True)
df.columns = df.columns.str.lower().str.replace(' ', '_')
Transformations
# Add new columns
df['total'] = df['quantity'] * df['price']
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
# Apply function to column
df['category'] = df['amount'].apply(
lambda x: 'High' if x > 100 else 'Low'
)
# Apply function to DataFrame
def process_row(row):
return row['qty'] * row['price'] * (1 - row['discount'])
df['net_amount'] = df.apply(process_row, axis=1)
# Map values
status_map = {'A': 'Active', 'I': 'Inactive'}
df['status'] = df['status_code'].map(status_map)
# Binning
df['age_group'] = pd.cut(df['age'],
bins=[0, 18, 35, 50, 100],
labels=['Youth', 'Young Adult', 'Middle Age', 'Senior']
)
# Pivot table
pivot = df.pivot_table(
values='amount',
index='region',
columns='month',
aggfunc='sum'
)
Aggregations
# Basic aggregations
df['amount'].sum()
df['amount'].mean()
df['amount'].median()
df['amount'].std()
df['customer_id'].nunique() # Unique count
# Group by
df.groupby('category')['amount'].sum()
df.groupby('category').agg({
'amount': 'sum',
'quantity': 'mean',
'order_id': 'count'
})
# Multiple aggregations
df.groupby('category').agg({
'amount': ['sum', 'mean', 'count'],
'quantity': ['sum', 'mean']
})
# Named aggregations (cleaner output)
df.groupby('category').agg(
total_sales=('amount', 'sum'),
avg_order=('amount', 'mean'),
order_count=('order_id', 'count')
)
# Reset index after groupby
result = df.groupby('category')['amount'].sum().reset_index()
Merging and Joining
# Merge (SQL-style joins)
merged = pd.merge(
orders,
customers,
on='customer_id',
how='left' # 'inner', 'outer', 'right'
)
# Merge on different column names
merged = pd.merge(
orders,
customers,
left_on='cust_id',
right_on='customer_id'
)
# Concatenate (stack vertically)
combined = pd.concat([df1, df2], ignore_index=True)
# Join on index
df1.join(df2, how='left')
Date Operations
# Convert to datetime
df['date'] = pd.to_datetime(df['date'])
# Extract components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.day_name()
df['quarter'] = df['date'].dt.quarter
# Date arithmetic
df['days_since'] = (pd.Timestamp.now() - df['date']).dt.days
# Resample time series
monthly = df.set_index('date').resample('M')['amount'].sum()
weekly = df.set_index('date').resample('W')['amount'].mean()
# Rolling calculations
df['rolling_avg'] = df['amount'].rolling(window=7).mean()
df['cumsum'] = df['amount'].cumsum()
Exporting Data
# To CSV
df.to_csv('output.csv', index=False)
# To Excel
df.to_excel('output.xlsx', sheet_name='Data', index=False)
# Multiple sheets
with pd.ExcelWriter('report.xlsx') as writer:
df1.to_excel(writer, sheet_name='Summary')
df2.to_excel(writer, sheet_name='Details')
# To SQL
df.to_sql('table_name', engine, if_exists='replace')
# To JSON
df.to_json('output.json', orient='records')
Best Practices
- Use vectorized operations: Avoid loops, use built-in methods
- Chain methods: df.dropna().groupby('cat').sum()
- Use appropriate dtypes: category for low-cardinality strings
- Copy when needed: df_copy = df.copy() to avoid warnings
- Use query for readability: df.query('col > 5')
- Chunk large files: pd.read_csv(file, chunksize=10000)
Master Pandas with Expert Mentorship
Our Data Analytics program covers Pandas from basics to advanced analysis. Manipulate data like a pro with guidance from industry experts.
Explore Data Analytics Program