
# 数据预处理与特征工程
在大多数机器学习问题中,特征质量比算法选择更重要。
## 缺失值处理策略
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
df = pd.read_csv("dataset.csv")
# Profile missingness: null count per column, then that count expressed as a
# percentage of the number of rows.
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
# Report only the columns that actually contain missing values.
print(missing_pct.loc[missing_pct.gt(0)])
# 按类型选择策略
def impute_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with sparse columns dropped and gaps filled.

    Columns whose non-null count is below half the number of rows are
    removed. The remaining numeric columns are filled with their median and
    the remaining ``object`` columns with their most frequent value. The
    input frame is left untouched.

    Args:
        df: Frame to clean.

    Returns:
        A new frame containing the surviving columns with missing values
        imputed. A frame with no rows or no columns is returned as a plain
        copy because there is nothing to estimate fill values from.
    """
    df = df.copy()
    if df.empty:
        return df

    # Drop columns that are more than 50% missing. ``thresh`` is the minimum
    # number of non-null values a column needs to be kept; round half the row
    # count up so an integer is passed, as pandas documents.
    min_non_null = (len(df) + 1) // 2
    df = df.dropna(thresh=min_non_null, axis=1)

    # Numeric columns: median fill. The imputer is only built when there is
    # something to fit, because fitting on zero columns raises.
    num_cols = df.select_dtypes(include=[np.number]).columns
    if not num_cols.empty:
        num_imputer = SimpleImputer(strategy="median")
        df[num_cols] = num_imputer.fit_transform(df[num_cols])

    # Categorical columns: mode fill, guarded the same way.
    cat_cols = df.select_dtypes(include=["object"]).columns
    if not cat_cols.empty:
        cat_imputer = SimpleImputer(strategy="most_frequent")
        df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

    return df
# Advanced: iterative imputation (MICE). IterativeImputer works on numeric
# input only, so restrict it to the numeric columns rather than passing the
# whole frame, which would fail on string columns.
mice_imputer = IterativeImputer(max_iter=10, random_state=42)
num_cols = df.select_dtypes(include=[np.number]).columns
# fit_transform returns a bare array; reattach the column labels and df's
# index so the result stays row-aligned with df.
df_imputed = pd.DataFrame(
    mice_imputer.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index,
)

## 编码分类变量
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import category_encoders as ce
# Target encoding (for high-cardinality features), applied to the "city" and
# "product_category" columns. The encoder is fitted on the training features
# together with the training target.
# NOTE(review): X_train / y_train are not defined in this excerpt —
# presumably a train split created elsewhere; confirm.
target_encoder = ce.TargetEncoder(cols=["city", "product_category"])
X_encoded = target_encoder.fit_transform(X_train, y_train)
# 频率编码
def frequency_encode(df: pd.DataFrame, col: str) -> pd.Series:
    """Replace each value of ``df[col]`` with its relative frequency.

    Args:
        df: Frame holding the column to encode.
        col: Label of the column to encode.

    Returns:
        A series with the same index as ``df[col]`` in which every value is
        the share of rows carrying that value, as computed by
        ``value_counts(normalize=True)``.
    """
    column = df[col]
    return column.map(column.value_counts(normalize=True))
# WOE (weight of evidence) encoding, intended for binary classification.
# Fitted on the training features together with the training target.
woe_encoder = ce.WOEEncoder(cols=["customer_segment"])
X_woe = woe_encoder.fit_transform(X_train, y_train)
# Hashing encoding (for very high-cardinality columns) of "user_id",
# requesting 16 components. This encoder is fitted without the target.
hash_encoder = ce.HashingEncoder(cols=["user_id"], n_components=16)
X_hash = hash_encoder.fit_transform(X_train)
## 特征缩放
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer
# 缩放前检查分布
import matplotlib.pyplot as plt
# num_cols is otherwise only a local variable inside impute_dataset, so derive
# the numeric column labels here before plotting their histograms.
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols].hist(bins=30, figsize=(15, 10))
# Standard scaling (assumes roughly normally distributed features)
scaler = StandardScaler()
# Robust scaling (for features with outliers)
robust_scaler = RobustScaler()
# 对偏斜特征进行对数变换
from scipy.stats import skew
def auto_transform(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    """Log-transform the right-skewed numeric columns of a copy of ``df``.

    Args:
        df: Frame to transform; it is not modified.
        threshold: A numeric column is transformed when its skewness is
            strictly greater than this value.

    Returns:
        A copy of ``df`` in which every selected column has been replaced by
        ``log1p`` of its values, with negative values clipped to 0 first.
        All other columns are unchanged.
    """
    result = df.copy()
    numeric_names = result.select_dtypes(include=[np.number]).columns
    # Decide which columns qualify before rewriting any of them.
    skewed_names = [
        name for name in numeric_names if result[name].skew() > threshold
    ]
    for name in skewed_names:
        non_negative = result[name].clip(lower=0)
        result[name] = np.log1p(non_negative)
    return result
# PowerTransformer (Yeo-Johnson) to make the numeric features more
# Gaussian-like.
pt = PowerTransformer(method="yeo-johnson")
# fit_transform returns a plain array; pass df's index explicitly so the rows
# of df_transformed stay aligned with df even when its index is not 0..n-1.
df_transformed = pd.DataFrame(
    pt.fit_transform(df[num_cols]),
    columns=num_cols,
    index=df.index,
)
## 自动化特征工程
import featuretools as ft
# Create the entity set that will hold both tables.
# NOTE(review): orders_df and customers_df are not defined in this excerpt —
# presumably loaded elsewhere; confirm.
es = ft.EntitySet(id="customer_data")
# Register the orders table, indexed by order_id with order_date as its
# time index.
es = es.add_dataframe(
dataframe=orders_df,
dataframe_name="orders",
index="order_id",
time_index="order_date",
)
# Register the customers table, indexed by customer_id.
es = es.add_dataframe(
dataframe=customers_df,
dataframe_name="customers",
index="customer_id",
)
# Link the two tables through their customer_id columns.
# NOTE(review): presumably the first pair names the parent side (customers)
# and the second the child side (orders) — confirm against featuretools docs.
es = es.add_relationship("customers", "customer_id", "orders", "customer_id")
# Generate features automatically, one row per customer, from the listed
# aggregation and transform primitives with max_depth=2.
feature_matrix, feature_defs = ft.dfs(
entityset=es,
target_dataframe_name="customers",
agg_primitives=["count", "sum", "mean", "max", "min", "std"],
trans_primitives=["year", "month", "weekday"],
max_depth=2,
)
# Report how many feature definitions were produced.
print(f"生成了 {len(feature_defs)} 个特征")
## 特征选择
from sklearn.feature_selection import (
SelectKBest, f_classif, mutual_info_classif,
RFECV, SelectFromModel,
)
from sklearn.ensemble import RandomForestClassifier
# 单变量选择
# Keep the 20 highest-scoring features by mutual information. k is clamped to
# the number of available columns so a matrix with fewer than 20 features is
# kept whole instead of tripping SelectKBest's k > n_features check.
selector = SelectKBest(
    score_func=mutual_info_classif,
    k=min(20, X_train.shape[1]),
)
X_selected = selector.fit_transform(X_train, y_train)
selected_features = X_train.columns[selector.get_support()].tolist()
# Recursive feature elimination with 5-fold cross-validation scored by ROC
# AUC. The forest is seeded (as the MICE imputer is) so repeated runs select
# the same features.
rfecv = RFECV(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    cv=5,
    scoring="roc_auc",
)
rfecv.fit(X_train, y_train)
important_features = X_train.columns[rfecv.support_].tolist()
# 基于SHAP的选择
import shap
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
# Depending on the shap release, a classifier yields either a list with one
# (n_samples, n_features) array per class or a single array with a trailing
# class axis. Select the positive class (index 1) in both layouts; indexing
# the array layout with [1] would pick the second sample instead of a class.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
else:
    positive_class_shap = np.asarray(shap_values)
    if positive_class_shap.ndim == 3:
        positive_class_shap = positive_class_shap[:, :, 1]
# Mean absolute SHAP value per feature, ranked from most to least important.
mean_shap = np.abs(positive_class_shap).mean(axis=0)
feature_importance = pd.Series(mean_shap, index=X_train.columns).sort_values(ascending=False)
top_features = feature_importance.head(20).index.tolist()
## 流水线组合
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# The column lists handed to ColumnTransformer must describe the frame the
# pipeline is fitted on. They are otherwise only locals inside
# impute_dataset, so derive them from X_train here.
num_cols = X_train.select_dtypes(include=[np.number]).columns
cat_cols = X_train.select_dtypes(include=["object"]).columns

# Numeric branch: median imputation followed by robust scaling.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler()),
])
# Categorical branch: mode imputation followed by target encoding.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", ce.TargetEncoder()),
])
# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
])
# Preprocessing and the classifier are fitted together as one estimator.
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=200)),
])
full_pipeline.fit(X_train, y_train)
predictions = full_pipeline.predict(X_test)
## 特征工程检查清单
| 技术 | 使用时机 |
|---|---|
| 对数变换 | 右偏数值 |
| 分箱 | 非线性关系 |
| 交互项 | 乘法效应 |
| 滞后特征 | 时间序列数据 |
| 滚动统计 | 时间模式 |
| 目标编码 | 高基数类别 |