
Python 的数据科学生态系统无与伦比。其核心是 Pandas——一个将混乱的真实世界数据转化为结构化洞察的库。
理解 DataFrame
DataFrame 是一种带有行标签和列标签的二维数据结构。可以把它想象成拥有超能力的电子表格。
import pandas as pd
import numpy as np
# Load the sales data; the 'date' column is parsed into datetimes.
df_csv = pd.read_csv('sales.csv', parse_dates=['date'])
# The inspection calls below read `df`, so bind it to the frame just loaded
# (previously `df` was never defined and the first print raised NameError).
df = df_csv
print(df.shape)  # (rows, columns)
print(df.dtypes)  # dtype of each column
print(df.describe())  # summary statistics
print(df.isnull().sum())  # missing-value count per column

处理缺失数据
# Drop rows where a key column is missing.
df = df.dropna(subset=['user_id', 'timestamp'])
# Forward-fill for time series: carry the last observed value forward.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated
# since pandas 2.1.
# NOTE(review): forward-filling presumably expects rows in time order - confirm.
df['temperature'] = df['temperature'].ffill()
# Median fill for skewed data.
df['price'] = df['price'].fillna(df['price'].median())
# Group-aware fill: use the median age within each gender group.
df['age'] = df.groupby('gender')['age'].transform(
    lambda x: x.fillna(x.median())
)

GroupBy:聚合引擎
# Named aggregation: each keyword becomes an output column computed from
# the given source column with the given function.
summary = df.groupby(by='product_category').agg(
    total_revenue=pd.NamedAgg(column='revenue', aggfunc='sum'),
    avg_order_value=pd.NamedAgg(column='revenue', aggfunc='mean'),
    order_count=pd.NamedAgg(column='order_id', aggfunc='nunique'),
    customer_count=pd.NamedAgg(column='customer_id', aggfunc='nunique'),
).reset_index()
# Window function: rolling mean over up to 7 rows within each product.
df['7d_rolling_avg'] = df.groupby(by='product_id')['daily_sales'].transform(
    lambda sales: sales.rolling(7, min_periods=1).mean()
)
# Rank within each category (dense ranking, highest sales first).
df['rank_in_category'] = df.groupby(by='category')['sales'].rank(
    ascending=False, method='dense'
)

合并数据集
# Left join on customer_id.
result = pd.merge(customers, orders, how='left', on='customer_id')
# Join on multiple keys.
result = pd.merge(
    transactions, exchange_rates, how='left', on=['currency', 'date']
)
# Anti-join: find customers who have never placed an order.
# indicator=True adds a '_merge' column recording where each row came from.
merged = pd.merge(
    customers,
    orders[['customer_id']].drop_duplicates(),
    how='left',
    on='customer_id',
    indicator=True,
)
never_ordered = merged.loc[merged['_merge'] == 'left_only']
可复现管道
class DataPipeline:
    """Load-and-clean pipeline for event data stored as Parquet."""

    def load(self, path: str) -> pd.DataFrame:
        """Read a Parquet file and check that the required columns exist.

        Args:
            path: Location of the Parquet file to read.

        Returns:
            The frame exactly as read from disk.

        Raises:
            ValueError: If 'user_id', 'timestamp' or 'event_type' is absent.
        """
        df = pd.read_parquet(path)
        required = {'user_id', 'timestamp', 'event_type'}
        missing = required - set(df.columns)
        if missing:
            raise ValueError(f"Missing columns: {missing}")
        return df

    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a de-duplicated copy of ``df`` with UTC timestamps.

        Args:
            df: Frame containing at least 'user_id' and 'timestamp' columns.

        Returns:
            A new frame; the frame passed in is left untouched.
        """
        # Work on a copy so the caller's frame is never mutated.
        df = df.copy()
        df = df.drop_duplicates(subset=['user_id', 'timestamp'])
        df = df.dropna(subset=['user_id'])
        df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
        return df

    def run(self, path: str) -> pd.DataFrame:
        """Load the file at ``path`` and return the cleaned frame."""
        return self.clean(self.load(path))
大规模性能
# Categorical dtype: string columns can use roughly 10x less memory
# (the saving depends on how repetitive the values are).
for column in ('country', 'status'):
    df[column] = df[column].astype('category')
# Process a large file in chunks of 100,000 rows instead of loading it at once.
results = [
    process_chunk(chunk)
    for chunk in pd.read_csv('huge_file.csv', chunksize=100_000)
]
final_df = pd.concat(results, ignore_index=True)
# Fast filtering with a query string.
df.query('revenue > 1000 and country == "US" and age >= 18')
掌握 Pandas 的关键在于知道何时使用哪种抽象。构建可测试的管道,并在优化之前始终进行性能分析。
→ 使用 JSON Viewer 工具验证你的数据转换。