正在加载,请稍候…

使用Transformers进行NLP:文本分类与命名实体识别

学习使用 Hugging Face Transformers 完成 NLP 任务:基于 BERT 和现代大语言模型,构建生产级的文本分类器、命名实体识别系统和序列标注模型。

使用Transformers进行NLP:文本分类与命名实体识别

使用Transformers进行NLP:生产指南

使用BERT进行文本分类

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, load_metric
import torch
import numpy as np

# Checkpoint name used for both the tokenizer and the classifier below.
MODEL_NAME = "bert-base-uncased"
NUM_LABELS = 3  # e.g. negative / neutral / positive

# Tokenizer and sequence-classification model are loaded from the same
# checkpoint; the model is configured with NUM_LABELS output classes.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=NUM_LABELS
)

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    The tokenizer is asked to truncate and to pad to ``max_length`` with a
    maximum length of 512.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw input text(s).

    Returns:
        The result of calling the module-level ``tokenizer`` on those texts.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)

# Prepare the dataset: three toy examples, one per sentiment label.
train_ds = Dataset.from_list(
    [
        {"text": sentence, "label": sentiment}
        for sentence, sentiment in (
            ("Great product!", 2),
            ("Terrible quality.", 0),
            ("It is okay.", 1),
        )
    ]
)

# Apply the tokenizer batch-wise to the dataset.
tokenized_ds = train_ds.map(tokenize_function, batched=True)

# Training configuration.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    # Checkpoints are saved on the same schedule as evaluation so that the
    # best one can be restored at the end of training.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # "f1" is one of the keys returned by compute_metrics.
    metric_for_best_model="f1",
    # Enable bf16 mixed precision only when the CUDA device supports it;
    # hard-coding bf16=True fails on hardware without bfloat16 support.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    from sklearn.metrics import accuracy_score, f1_score

    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted)
    weighted_f1 = f1_score(references, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Wire the model, training arguments, data and metric function into a Trainer
# and start training.
# NOTE(review): eval_dataset is the same object as train_dataset, so the
# reported metrics are measured on the training data — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    eval_dataset=tokenized_ds,
    compute_metrics=compute_metrics,
)
trainer.train()

使用Transformers进行NLP:文本分类与命名实体识别插图

命名实体识别(NER)

from transformers import AutoModelForTokenClassification, pipeline

# Checkpoint that is already fine-tuned for NER; model, tokenizer and label
# maps must all come from it.
NER_MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"

# Load the model with the label mapping stored in its own config. Overriding
# id2label/label2id on a fine-tuned checkpoint with a hand-written list risks
# a classifier-head size mismatch or silently wrong entity labels.
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)

# Expose the checkpoint's label maps under the names used by this module.
id2label = dict(ner_model.config.id2label)
label2id = dict(ner_model.config.label2id)
NER_LABELS = [id2label[i] for i in sorted(id2label)]

ner_pipeline = pipeline(
    "ner",
    model=ner_model,
    # Passing the checkpoint name makes the pipeline load the matching cased
    # tokenizer; the module-level `tokenizer` belongs to bert-base-uncased and
    # must not be reused with this model.
    tokenizer=NER_MODEL_NAME,
    aggregation_strategy="simple",
    # First GPU when CUDA is available, otherwise CPU.
    device=0 if torch.cuda.is_available() else -1,
)

# Run the NER pipeline on a sample sentence and print one line per entity.
text = "Elon Musk founded Tesla in Palo Alto, California."
entities = ner_pipeline(text)
for e in entities:
    # Each entry is read through its "word", "entity_group" and "score" keys.
    print(f"{e['word']}: {e['entity_group']} ({e['score']:.3f})")
# Example output as given in the original listing:
# Elon Musk: PER (0.999)
# Tesla: ORG (0.997)
# Palo Alto: LOC (0.993)
# California: LOC (0.991)

使用Transformers进行NLP:文本分类与命名实体识别插图

零样本分类

from transformers import pipeline

# Zero-shot classification pipeline shared by classify_text.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


def classify_text(text: str, candidate_labels: list[str]) -> dict:
    """Score ``text`` against each candidate label (single-label mode).

    Args:
        text: Input text to classify.
        candidate_labels: Labels to score the text against.

    Returns:
        Dict mapping each returned label to its score.
    """
    prediction = classifier(
        text,
        candidate_labels=candidate_labels,
        multi_label=False,
    )
    return {
        label: score
        for label, score in zip(prediction["labels"], prediction["scores"])
    }

# Classify a sample sentence and report the highest-scoring category.
categories = ["technology", "sports", "politics", "entertainment", "business"]
text = "The new iPhone 16 features advanced AI chips and improved camera system."
scores = classify_text(text, categories)
# Pick the label whose score is largest.
top = max(scores, key=scores.get)
print(f"Category: {top} ({scores[top]:.3f})")
# Example output as given in the original listing:
# Category: technology (0.987)

使用Transformers进行NLP:文本分类与命名实体识别插图

用于语义任务的文本嵌入

from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used by the similarity helpers.
# NOTE: this rebinds the module-level name `model`.
model = SentenceTransformer("all-MiniLM-L6-v2")


def semantic_similarity(text1: str, text2: str) -> float:
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity as a Python float.
    """
    first, second = (
        model.encode(passage, convert_to_tensor=True)
        for passage in (text1, text2)
    )
    return float(util.cos_sim(first, second))

def find_most_similar(query: str, documents: list[str], top_k: int = 5) -> list:
    """Return the documents most similar to ``query``, best match first.

    Args:
        query: Text to search for.
        documents: Candidate texts to rank.
        top_k: Maximum number of results. It is clamped to
            ``len(documents)``, so asking for more results than there are
            documents returns all of them instead of raising.

    Returns:
        List of ``(document, cosine_similarity)`` tuples ordered by
        decreasing similarity. Empty when ``documents`` is empty or
        ``top_k`` is not positive.
    """
    # topk() raises when k exceeds the number of scores, so clamp first and
    # skip the encoding work entirely when nothing can be returned.
    k = min(top_k, len(documents))
    if k <= 0:
        return []

    query_emb = model.encode(query, convert_to_tensor=True)
    doc_embs = model.encode(documents, convert_to_tensor=True)
    # cos_sim returns a (1, n_documents) matrix here; take the single row.
    scores = util.cos_sim(query_emb, doc_embs)[0]
    best = scores.topk(k=k)
    return [
        (documents[index], float(score))
        for index, score in zip(best.indices.tolist(), best.values.tolist())
    ]

生产级NLP API

from fastapi import FastAPI
from pydantic import BaseModel
from functools import lru_cache

# Application object that the route decorators in this file register on.
app = FastAPI()

# Request body for the text endpoints: a single string field.
class TextRequest(BaseModel):
    text: str  # raw input text to analyze

@lru_cache(maxsize=1)
def get_classifier():
    """Return the text-classification pipeline, building it on first use.

    ``lru_cache(maxsize=1)`` keeps the single instance so that later calls
    reuse it instead of constructing the pipeline again.
    """
    sentiment_pipeline = pipeline(
        "text-classification",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )
    return sentiment_pipeline

@lru_cache(maxsize=1)
def get_ner():
    """Return the NER pipeline, building it on first use.

    The checkpoint is named explicitly: calling ``pipeline("ner")`` without a
    model makes transformers pick a default and warn that this is not
    recommended in production. ``lru_cache(maxsize=1)`` keeps the single
    instance for later calls.
    """
    return pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
        aggregation_strategy="simple",
    )

@app.post("/classify")
async def classify(req: TextRequest):
    """Classify ``req.text`` and return the predicted label with its score.

    Returns:
        ``{"label": <str>, "confidence": <float>}``.
    """
    # NOTE(review): inference runs synchronously inside this coroutine, so the
    # event loop is blocked for the duration of the model call.
    clf = get_classifier()
    # Truncate over-length input to the model's maximum sequence length;
    # without it, long request bodies make the model call fail.
    result = clf(req.text, truncation=True)[0]
    # float() guarantees a plain JSON-serializable number.
    return {"label": result["label"], "confidence": float(result["score"])}

@app.post("/ner")
async def extract_entities(req: TextRequest):
    """Extract named entities from ``req.text``.

    Returns:
        ``{"entities": [...]}`` with one dict per entity, containing only
        plain Python values so the response can be JSON-encoded.
    """
    # NOTE(review): inference runs synchronously inside this coroutine, so the
    # event loop is blocked for the duration of the model call.
    ner = get_ner()
    # The aggregated pipeline output carries numpy scalars (e.g. the score),
    # which the JSON encoder rejects; .item() converts them to Python scalars
    # while strings and built-in numbers pass through unchanged.
    entities = [
        {
            key: value.item() if hasattr(value, "item") else value
            for key, value in entity.items()
        }
        for entity in ner(req.text)
    ]
    return {"entities": entities}

# POST /similarity: text1 and text2 are plain `str` parameters (not a body
# model); the response wraps the semantic_similarity() result.
@app.post("/similarity")
async def compute_similarity(text1: str, text2: str):
    score = semantic_similarity(text1, text2)
    return {"similarity": score}

推理模型优化

from transformers import pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification

# Export the checkpoint to ONNX to speed up CPU inference.
ONNX_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

ort_model = ORTModelForSequenceClassification.from_pretrained(
    ONNX_MODEL_NAME,
    export=True,
)

optimized_classifier = pipeline(
    "text-classification",
    model=ort_model,
    # Load the tokenizer that belongs to this checkpoint; the module-level
    # `tokenizer` was created for bert-base-uncased, a different model.
    tokenizer=ONNX_MODEL_NAME,
)

# Reported as roughly 3x faster on CPU with ONNX Runtime (not measured here).
result = optimized_classifier("This is fantastic!")