
使用Transformers进行NLP:生产指南
使用BERT进行文本分类
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, load_metric
import torch
import numpy as np
# Checkpoint name and label count shared by the tokenizer and the classifier.
MODEL_NAME = "bert-base-uncased"
NUM_LABELS = 3  # e.g. negative / neutral / positive

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of *examples* with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)
# Prepare the dataset: three (text, label) samples.
_raw_samples = [
    ("Great product!", 2),
    ("Terrible quality.", 0),
    ("It is okay.", 1),
]
train_ds = Dataset.from_list(
    [{"text": sentence, "label": label} for sentence, label in _raw_samples]
)
tokenized_ds = train_ds.map(tokenize_function, batched=True)
# Training
# bf16 mixed precision is enabled only when a CUDA device that supports
# bfloat16 is present. Requesting it unconditionally makes TrainingArguments
# fail on CPU-only machines and on GPUs without bfloat16 support.
_bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics.
    metric_for_best_model="f1",
    bf16=_bf16_supported,
)
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    The predicted class for each row is the argmax over the last axis of the
    logits. Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    from sklearn.metrics import accuracy_score, f1_score

    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(labels, predicted)
    metrics["f1"] = f1_score(labels, predicted, average="weighted")
    return metrics
# Wire the model, arguments, data and metric callback together, then train.
# NOTE: the same tokenized dataset is passed for both training and evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds,
    eval_dataset=tokenized_ds,
)
trainer.train()

命名实体识别(NER)
from transformers import AutoModelForTokenClassification, pipeline
# Load the model AND its tokenizer from the same checkpoint. The NER model is
# a *cased* BERT, so reusing the module-level `bert-base-uncased` tokenizer
# would lowercase the input and feed the model token ids from a different
# vocabulary.
NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"
ner_tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
ner_model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# Take the label maps from the checkpoint's own config. Overriding them with a
# hand-written list that omits the MISC tags changes the number of labels and
# no longer matches the fine-tuned classification head.
id2label = dict(ner_model.config.id2label)
label2id = dict(ner_model.config.label2id)
NER_LABELS = [id2label[i] for i in sorted(id2label)]

ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    # "simple" merges word pieces of the same entity into one result.
    aggregation_strategy="simple",
    # First GPU when available, otherwise CPU (-1).
    device=0 if torch.cuda.is_available() else -1,
)

text = "Elon Musk founded Tesla in Palo Alto, California."
entities = ner_pipeline(text)
for e in entities:
    print(f"{e['word']}: {e['entity_group']} ({e['score']:.3f})")
# Example output:
# Elon Musk: PER (0.999)
# Tesla: ORG (0.997)
# Palo Alto: LOC (0.993)
# California: LOC (0.991)

零样本分类
from transformers import pipeline
# Zero-shot classification pipeline, created once at import time.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)
def classify_text(text: str, candidate_labels: list[str]) -> dict:
    """Score *text* against *candidate_labels* with the zero-shot classifier.

    Runs in single-label mode (``multi_label=False``) and returns a mapping
    from each returned label to its score.
    """
    prediction = classifier(
        text, candidate_labels=candidate_labels, multi_label=False
    )
    return {
        label: score
        for label, score in zip(prediction["labels"], prediction["scores"])
    }
# Example: pick the best-matching category for a short news sentence.
categories = ["technology", "sports", "politics", "entertainment", "business"]
text = "The new iPhone 16 features advanced AI chips and improved camera system."
scores = classify_text(text, categories)

# The winning label is the one with the highest score.
top = max(scores, key=lambda label: scores[label])
print(f"Category: {top} ({scores[top]:.3f})")
# Category: technology (0.987)

用于语义任务的文本嵌入
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used by the similarity helpers below.
# NOTE: this rebinds the module-level name `model`, which earlier in this file
# refers to the BERT sequence-classification model.
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
)
def semantic_similarity(text1: str, text2: str) -> float:
    """Return the cosine similarity between the embeddings of two texts."""
    first, second = (
        model.encode(sentence, convert_to_tensor=True)
        for sentence in (text1, text2)
    )
    return float(util.cos_sim(first, second))
def find_most_similar(query: str, documents: list[str], top_k: int = 5) -> list:
    """Return the documents most similar to *query*, best match first.

    Args:
        query: Text to compare against every document.
        documents: Candidate texts.
        top_k: Maximum number of results. Fewer are returned when there are
            fewer documents than ``top_k``.

    Returns:
        A list of ``(document, cosine_similarity)`` tuples ordered by
        decreasing similarity. Empty when ``documents`` is empty or
        ``top_k`` is not positive.
    """
    # topk() raises when k exceeds the number of candidates, so clamp it;
    # this matters because the default top_k is 5.
    k = min(top_k, len(documents))
    if k <= 0:
        return []

    query_emb = model.encode(query, convert_to_tensor=True)
    doc_embs = model.encode(documents, convert_to_tensor=True)
    # cos_sim returns one row per query; there is a single query here.
    scores = util.cos_sim(query_emb, doc_embs)[0]

    best = scores.topk(k=k)
    return [
        (documents[index], float(score))
        for index, score in zip(best.indices.tolist(), best.values.tolist())
    ]
生产级NLP API
from fastapi import FastAPI
from pydantic import BaseModel
from functools import lru_cache
# FastAPI application instance that the route decorators below attach to.
app = FastAPI()
class TextRequest(BaseModel):
    # Request body with the single piece of text to analyse.
    # A `#` comment is used instead of a docstring so the generated schema
    # description stays unchanged.
    text: str
@lru_cache(maxsize=1)
def get_classifier():
    """Return the text-classification pipeline, building it on first use only.

    The single-entry cache means later calls reuse the same pipeline object.
    """
    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    return pipeline("text-classification", model=checkpoint)
@lru_cache(maxsize=1)
def get_ner():
    """Return the NER pipeline, building it on first use only.

    The single-entry cache means later calls reuse the same pipeline object.
    """
    # Pin the checkpoint explicitly (the same one the NER section of this file
    # uses). Relying on the library's implicit default for the task means the
    # served model can change with a library upgrade.
    return pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
        aggregation_strategy="simple",
    )
@app.post("/classify")
async def classify(req: TextRequest):
    """Classify ``req.text`` and return the top label with its score.

    NOTE(review): the pipeline call is synchronous and runs inside this
    coroutine.
    """
    top_prediction = get_classifier()(req.text)[0]
    return {
        "label": top_prediction["label"],
        "confidence": top_prediction["score"],
    }
@app.post("/ner")
async def extract_entities(req: TextRequest):
    """Run NER over ``req.text`` and return the entities as JSON-safe dicts.

    Returns:
        ``{"entities": [...]}`` where each entry has the same keys the
        pipeline produced, with NumPy scalar values converted to native
        Python numbers.
    """
    ner = get_ner()
    # The pipeline output can contain NumPy scalars (e.g. the score), which
    # the JSON response encoder cannot serialize. Convert them up front.
    entities = [
        {
            key: value.item() if isinstance(value, np.generic) else value
            for key, value in entity.items()
        }
        for entity in ner(req.text)
    ]
    return {"entities": entities}
@app.post("/similarity")
async def compute_similarity(text1: str, text2: str):
    """Return the semantic similarity of *text1* and *text2*."""
    score = semantic_similarity(text1, text2)
    return {"similarity": score}
推理模型优化
from transformers import pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification
# Export to ONNX to speed up CPU inference.
ONNX_CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
ort_model = ORTModelForSequenceClassification.from_pretrained(
    ONNX_CHECKPOINT,
    export=True,
)
# Use the tokenizer that belongs to the exported checkpoint, not the
# module-level BERT tokenizer, so the inputs match what this model expects.
ort_tokenizer = AutoTokenizer.from_pretrained(ONNX_CHECKPOINT)
optimized_classifier = pipeline(
    "text-classification",
    model=ort_model,
    tokenizer=ort_tokenizer,
)
# Roughly a 3x speed-up on CPU with ONNX Runtime.
result = optimized_classifier("This is fantastic!")