text classification
Defining input and output
AllenNLP
One example corresponds to one Instance; an Instance holds one or more Fields (each Field is a piece of input or output data): a TextField for the input text, a LabelField for the output label.
Each Field is converted to a tensor and fed to the model.
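A minimal sketch of this idea, assuming the AllenNLP 1.x API and a made-up review/label:

```python
from allennlp.data import Instance
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer

tokens = WhitespaceTokenizer().tokenize("a very good movie")
text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})  # input
label_field = LabelField("pos")                                     # output
instance = Instance({"text": text_field, "label": label_field})
print(instance)
```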
Read data
Transform the raw data into Instances that match the input/output spec
Write a DatasetReader
Read the text line by line, split each line into individual tokens, then map each token to its vocabulary ID to build tensors
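Continuing the sketch above, indexing against a Vocabulary is what turns tokens into IDs and then tensors (again assuming the AllenNLP 1.x API):

```python
from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer

tokens = WhitespaceTokenizer().tokenize("a very good movie")
instance = Instance({"text": TextField(tokens, {"tokens": SingleIdTokenIndexer()}),
                     "label": LabelField("pos")})
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)      # tokens -> vocabulary IDs
print(instance.as_tensor_dict())  # fields packed into tensors
```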
Build model
Get features from each input word
Combine the word-level features into a single document-level feature vector
Classify that vector into one of the labels
Steps
[^batch_size: the number of samples used in each training step]:
- Encode the instances into token IDs (SingleIdTokenIndexer())
- Pass the token IDs through an embedding layer to turn each token into a vector
- Squash the sequence of vectors into a single document vector
- Produce a probability distribution over the labels and compute the loss (see the sketch after this list)
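A plain-PyTorch sketch of these four steps, with made-up sizes (batch of 2, 5 tokens, vocabulary of 100, 2 labels):

```python
import torch
import torch.nn.functional as F

batch_size, num_tokens, vocab_size, embedding_dim, num_labels = 2, 5, 100, 10, 2
token_ids = torch.randint(0, vocab_size, (batch_size, num_tokens))  # 1. token IDs
embedding = torch.nn.Embedding(vocab_size, embedding_dim)
embedded = embedding(token_ids)    # 2. (batch_size, num_tokens, embedding_dim)
encoded = embedded.mean(dim=1)     # 3. squashed to (batch_size, embedding_dim)
logits = torch.nn.Linear(embedding_dim, num_labels)(encoded)
probs = F.softmax(logits, dim=-1)  # 4. distribution over labels
loss = F.cross_entropy(logits, torch.tensor([0, 1]))
```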
AllenNLP Model structure
- is a PyTorch Model (a subclass of torch.nn.Module)
- forward() returns its outputs as a dictionary
- the output dictionary includes a 'loss' key, which the trainer uses to optimize the model
Overall flow
Text -> DatasetReader -> Instances (TextField + LabelField) -> Model (embedding vectors + Seq2Vec encoding) -> labels
Raw text -> tokens -> Instance -> token IDs -> tensors -> vectors
Training and prediction
Reading the movie review dataset
```python
from typing import Dict, Iterable

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WhitespaceTokenizer


class ClassificationTsvReader(DatasetReader):
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = None):
        super().__init__(lazy)
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens

    def _read(self, file_path: str) -> Iterable[Instance]:
        # Each line of the TSV is "text<TAB>sentiment".
        with open(file_path, 'r') as lines:
            for line in lines:
                text, sentiment = line.strip().split('\t')
                tokens = self.tokenizer.tokenize(text)
                if self.max_tokens:
                    tokens = tokens[:self.max_tokens]
                text_field = TextField(tokens, self.token_indexers)
                label_field = LabelField(sentiment)
                fields = {'text': text_field, 'label': label_field}
                yield Instance(fields)


dataset_reader = ClassificationTsvReader(max_tokens=64)
# With lazy=False, read() returns a list of instances, so slicing works.
instances = dataset_reader.read("quick_start/data/movie_review/train.tsv")
for instance in instances[:10]:
    print(instance)
```
Text -> Instances; the output is the first ten instances read from the dataset.
Feeding instances to the model

```python
from typing import Dict, Iterable

import torch
from allennlp.data import Instance, Vocabulary
from allennlp.models import Model
from allennlp.modules import Seq2VecEncoder, TextFieldEmbedder
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn import util


class SimpleClassifier(Model):
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder):
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        num_labels = vocab.get_vocab_size("labels")
        self.classifier = torch.nn.Linear(encoder.get_output_dim(), num_labels)

    def forward(self,
                text: Dict[str, torch.Tensor],
                label: torch.Tensor) -> Dict[str, torch.Tensor]:
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(text)
        # Shape: (batch_size, num_tokens)
        mask = util.get_text_field_mask(text)
        # Shape: (batch_size, encoding_dim)
        encoded_text = self.encoder(embedded_text, mask)
        # Shape: (batch_size, num_labels)
        logits = self.classifier(encoded_text)
        # Shape: (batch_size, num_labels)
        probs = torch.nn.functional.softmax(logits, dim=-1)
        # Shape: (1,)
        loss = torch.nn.functional.cross_entropy(logits, label)
        return {'loss': loss, 'probs': probs}


def run_training_loop():
    dataset_reader = ClassificationTsvReader(max_tokens=64)
    print("Reading data")
    instances = dataset_reader.read("quick_start/data/movie_review/train.tsv")
    vocab = build_vocab(instances)
    model = build_model(vocab)
    outputs = model.forward_on_instances(instances[:4])
    print(outputs)


def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    print("Building the vocabulary")
    return Vocabulary.from_instances(instances)


def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=10)
    return SimpleClassifier(vocab, embedder, encoder)


run_training_loop()
```
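The loop above only runs a forward pass on four instances. To actually train, a hedged sketch assuming the AllenNLP 2.x module layout (SimpleDataLoader and GradientDescentTrainer; module paths differ in 1.x):

```python
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.training import GradientDescentTrainer
from allennlp.training.optimizers import AdamOptimizer

def train(model: Model, instances):
    data_loader = SimpleDataLoader(list(instances), batch_size=8, shuffle=True)
    data_loader.index_with(model.vocab)  # map tokens/labels to IDs before batching
    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    trainer = GradientDescentTrainer(
        model=model,
        data_loader=data_loader,
        optimizer=AdamOptimizer(parameters),
        num_epochs=5,
    )
    trainer.train()  # optimizes against the 'loss' key returned by forward()
```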
`self.classifier = torch.nn.Linear(encoder.get_output_dim(), num_labels)`
[^Linear(input_dim, output_dim): takes input of shape (batch_size, input_dim) and produces output of shape (batch_size, output_dim)]:
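A quick shape check of that footnote, with made-up sizes:

```python
import torch

linear = torch.nn.Linear(10, 2)  # input_dim=10, output_dim=2
x = torch.randn(4, 10)           # (batch_size, input_dim)
print(linear(x).shape)           # torch.Size([4, 2]) == (batch_size, output_dim)
```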
torch.nn.functional.cross_entropy combines log_softmax and nll_loss in a single function; for a logit vector x and a target class, loss(x, class) = -log(exp(x[class]) / Σ_j exp(x[j])) = -x[class] + log(Σ_j exp(x[j]))

The nll_loss function expects log-probabilities and simply picks out the negative log-probability of the target class: loss(x, class) = -x[class], averaged over the batch by default.
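A quick check of that equivalence:

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 2)
labels = torch.tensor([0, 1, 1, 0])
# cross_entropy == log_softmax followed by nll_loss
assert torch.allclose(F.cross_entropy(logits, labels),
                      F.nll_loss(F.log_softmax(logits, dim=-1), labels))
```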
Running results:
[^???: how are the decimal values returned by embedding/encoding computed]:
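A likely answer to that open question: the decimals are just the randomly initialized embedding weights, and BagOfEmbeddingsEncoder sums the token vectors (or averages them when averaged=True). A quick check:

```python
import torch
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder

embedded = torch.randn(1, 3, 10)  # random-init embeddings: hence the arbitrary decimals
mask = torch.ones(1, 3, dtype=torch.bool)
encoder = BagOfEmbeddingsEncoder(embedding_dim=10)  # averaged=False -> sums token vectors
assert torch.allclose(encoder(embedded, mask), embedded.sum(dim=1))
```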