>>> from transformers import pipeline
# Use the sentiment-analysis pipeline
>>> classifier = pipeline('sentiment-analysis')
>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
>>> from transformers import pipeline
# Use the question-answering pipeline
>>> question_answerer = pipeline('question-answering')
>>> question_answerer({
... 'question': 'What is the name of the repository ?',
... 'context': 'Pipeline has been included in the huggingface/transformers repository'
... })
{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
from transformers import pipeline
classifier = pipeline("sentiment-analysis")  # sentiment analysis
classifier("I've been waiting for a HuggingFace course my whole life.")
# Output: [{'label': 'POSITIVE', 'score': 0.9598047137260437}]
You can also pass several sentences:
classifier(
["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)
# Output
'''
[{'label': 'POSITIVE', 'score': 0.9598047137260437},
{'label': 'NEGATIVE', 'score': 0.9994558095932007}]
'''
# This snippet needs a tokenizer; it uses the same checkpoint as the later examples.
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
"I've been waiting for a HuggingFace course my whole life.",
"I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)
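To see what padding actually did, you can inspect the returned tensors; this short check is an illustrative addition (the exact token IDs depend on the checkpoint's vocabulary):
print(inputs["input_ids"].shape)   # (2, length of the longest sequence in the batch)
print(inputs["attention_mask"])    # 1 marks real tokens, 0 marks padding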
from transformers import BertConfig, BertModel
# Building the config
config = BertConfig()
# Building the model from the config
model = BertModel(config)
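The config object holds the architecture hyperparameters; a quick illustrative look at two of them (the values shown are BERT's documented defaults):
print(config.hidden_size)        # 768 for the default BertConfig
print(config.num_hidden_layers)  # 12 for the default BertConfig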
2) Different ways to load a model
from transformers import BertModel
model = BertModel.from_pretrained("bert-base-cased")
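The counterpart to from_pretrained is save_pretrained, which writes the model's config and weights to a directory; the path below is just an example:
model.save_pretrained("./my-bert-model")  # example path; writes the config and weight files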
We have explored how a tokenizer works and looked at tokenization, conversion to input IDs, padding, truncation, and attention masks. The Transformers API can handle all of this for us with a high-level function.
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
sequence = "I've been waiting for a HuggingFace course my whole life."
model_inputs = tokenizer(sequence)
# 可以标记单个序列
sequence = "I've been waiting for a HuggingFace course my whole life."
model_inputs = tokenizer(sequence)
# You can also process several sequences at once
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
model_inputs = tokenizer(sequences)
# You can pad according to several strategies
# Will pad the sequences up to the length of the longest sequence in the batch
model_inputs = tokenizer(sequences, padding="longest")
# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")
# Will pad the sequences up to the specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
# You can also truncate sequences
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)
# Will truncate the sequences that are longer than the specified max length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)
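Again, an illustrative length check; note that the tokenizer's output includes the special tokens, so the truncated length counts them too:
print([len(ids) for ids in model_inputs["input_ids"]])  # no list is longer than max_length=8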
# The tokenizer can also convert to framework-specific tensors, which can then be sent directly to the model
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
# Returns PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
# Returns TensorFlow tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")
# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")
Special tokens
The tokenizer adds the special token [CLS] at the beginning and the special token [SEP] at the end.
sequence = "I've been waiting for a HuggingFace course my whole life."
model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
# Output
'''
[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
'''
print(tokenizer.decode(model_inputs["input_ids"]))
print(tokenizer.decode(ids))
# Output
'''
"[CLS] i've been waiting for a huggingface course my whole life. [SEP]"
"i've been waiting for a huggingface course my whole life."
'''
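The special tokens and their IDs are exposed directly on the tokenizer; a quick illustrative look (the IDs match the 101/102 seen in the output above):
print(tokenizer.cls_token, tokenizer.cls_token_id)  # [CLS] 101
print(tokenizer.sep_token, tokenizer.sep_token_id)  # [SEP] 102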
# Summary: from tokenizer to model
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
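The model returns raw logits; to read them as probabilities you can apply a softmax and map class indices to names via the model config's id2label:
predictions = torch.nn.functional.softmax(output.logits, dim=-1)
print(predictions)
print(model.config.id2label)  # {0: 'NEGATIVE', 1: 'POSITIVE'} for this checkpoint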
Local Propagation: sliding-window methods (e.g., RBPN, TGA, EDVR) take the multiple LR frames within a local window as input and reconstruct the middle frame. This design restricts the range of information each frame can draw on, which in turn limits model performance. The figure below compares model performance under different information ranges; it shows that (1) exploiting global information yields better performance, and (2) performance differs sharply between the two ends of a clip, underlining the importance of information accumulated over long sequences (i.e., global information). A minimal sketch of the two propagation schemes follows.
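In this sketch, simple averaging stands in for the learned fusion/propagation modules; everything here is a hypothetical illustration of the two information ranges, not any paper's actual architecture:
import torch

def sliding_window_fusion(frames, window=5):
    # Local propagation: each output frame only sees `window` neighboring LR frames.
    T = frames.shape[0]
    outputs = []
    for t in range(T):
        lo, hi = max(0, t - window // 2), min(T, t + window // 2 + 1)
        outputs.append(frames[lo:hi].mean(dim=0))  # stand-in for a learned fusion network
    return torch.stack(outputs)

def recurrent_propagation(frames):
    # Global propagation: a hidden state carries information across the whole
    # sequence, so later frames benefit from everything that came before.
    h = torch.zeros_like(frames[0])
    outputs = []
    for frame in frames:
        h = 0.5 * h + 0.5 * frame  # stand-in for a learned propagation cell
        outputs.append(h)
    return torch.stack(outputs)

frames = torch.randn(30, 3, 64, 64)  # (T, C, H, W) toy LR clip
print(sliding_window_fusion(frames).shape, recurrent_propagation(frames).shape)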
Last November, the University of Waterloo proposed KaPao ("Rethinking Keypoint Representations: Modeling Keypoints and Poses as Objects for Multi-Person Human Pose Estimation"), which builds multi-person keypoint detection on YOLOv5. The paper has since been accepted at ECCV 2022; the performance it achieves is shown below: