Please help me check this code for logic errors:

```python
import numpy as np
import torch
import copy
from transformers import BertTokenizer, BertModel
import pandas as pd
import time


class Sentence:
    def __init__(self, sentence):
        self.sentence = sentence
        self.embeddings = None

    @staticmethod
    def get_embeddings(sentences, model, tokenizer):
        embeddings_list = []
        for sent in sentences:
            input_ids = torch.tensor([tokenizer.encode(sent, add_special_tokens=True)])[0]
            with torch.no_grad():
                bert_embedding = model(input_ids.unsqueeze(0))[1]  # the pooled [CLS] embedding
            embeddings_list.append(bert_embedding)
        return np.stack(embeddings_list)

    def get_bert_embeddings(self, model, tokenizer):
        self.embeddings = Sentence.get_embeddings([self.sentence], model, tokenizer).squeeze()

    @staticmethod
    def cos_similarity(x1, x2):
        return torch.sum(torch.mul(x1, x2)) / (torch.norm(x1) * torch.norm(x2))

    def get_bert_embedding(self, model, tokenizer):
        self.get_bert_embeddings(model, tokenizer)

    def compare_sentences(self, other_sentence, model, tokenizer):
        sentence_1 = Sentence(self.sentence)
        sentence_2 = Sentence(other_sentence)
        sentence_1.get_bert_embeddings(model, tokenizer)
        sentence_2.get_bert_embeddings(model, tokenizer)
        return Sentence.cos_similarity(sentence_1.embeddings, sentence_2.embeddings).item()

    def __repr__(self):
        return self.sentence


class EmbeddingCompare:
    def __init__(self, model_name='bert-base-uncased', data_path=None):
        if not data_path:
            raise ValueError('data_path is required')
        tokenizer = BertTokenizer.from_pretrained(model_name)
        bert_model = BertModel.from_pretrained(model_name)
        self.tokenizer = tokenizer
        self.bert_model = bert_model

    def load_data(self, data_path):
        df = pd.read_csv(data_path)
        return list(df['sentence'])

    def get_bert_embeddings_for_sentences(self, sentences):
        return Sentence.get_embeddings(sentences, self.bert_model, self.tokenizer)

    def compare_sentences(self, sentence1, sentence2):
        return Sentence.compare_sentences(sentence1, sentence2, self.bert_model, self.tokenizer)
```
There are a few logic errors here, all related to how `compare_sentences` passes its arguments. First, inside `Sentence.compare_sentences`, the method wraps `self.sentence` in a brand-new `Sentence` instead of using the instance it was called on; it should use `self` directly. Second, `EmbeddingCompare.compare_sentences` invokes `Sentence.compare_sentences(sentence1, ...)` as if it were a static method, so the raw string `sentence1` is bound to `self` and the call fails with an `AttributeError`; the string needs to be wrapped in a `Sentence` first. Finally, `get_embeddings` stacks the results with `np.stack`, which returns a NumPy array, but `cos_similarity` then passes that array to `torch.mul` and `torch.norm`, which only accept tensors; using `torch.stack` keeps everything in PyTorch. (The unused `copy` and `time` imports, and `numpy` after this change, can simply be dropped.)
Concretely, in `Sentence.compare_sentences`, replace:
```python
sentence_1 = Sentence(self.sentence)
sentence_2 = Sentence(other_sentence)
```
with:
```python
sentence_1 = self
sentence_2 = Sentence(other_sentence)
```
The full corrected code, with the other two fixes applied as well, looks like this:
```python
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd


class Sentence:
    def __init__(self, sentence):
        self.sentence = sentence
        self.embeddings = None

    @staticmethod
    def get_embeddings(sentences, model, tokenizer):
        embeddings_list = []
        for sent in sentences:
            input_ids = torch.tensor([tokenizer.encode(sent, add_special_tokens=True)])[0]
            with torch.no_grad():
                bert_embedding = model(input_ids.unsqueeze(0))[1]  # the pooled [CLS] embedding
            embeddings_list.append(bert_embedding)
        # Stack as a torch tensor so cos_similarity can use torch operations.
        return torch.stack(embeddings_list)

    def get_bert_embeddings(self, model, tokenizer):
        self.embeddings = Sentence.get_embeddings([self.sentence], model, tokenizer).squeeze()

    @staticmethod
    def cos_similarity(x1, x2):
        return torch.sum(torch.mul(x1, x2)) / (torch.norm(x1) * torch.norm(x2))

    def get_bert_embedding(self, model, tokenizer):
        self.get_bert_embeddings(model, tokenizer)

    def compare_sentences(self, other_sentence, model, tokenizer):
        # Use this instance directly; only the other sentence needs wrapping.
        sentence_1 = self
        sentence_2 = Sentence(other_sentence)
        sentence_1.get_bert_embeddings(model, tokenizer)
        sentence_2.get_bert_embeddings(model, tokenizer)
        return Sentence.cos_similarity(sentence_1.embeddings, sentence_2.embeddings).item()

    def __repr__(self):
        return self.sentence


class EmbeddingCompare:
    def __init__(self, model_name='bert-base-uncased', data_path=None):
        if not data_path:
            raise ValueError('data_path is required')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.bert_model = BertModel.from_pretrained(model_name)

    def load_data(self, data_path):
        df = pd.read_csv(data_path)
        return list(df['sentence'])

    def get_bert_embeddings_for_sentences(self, sentences):
        return Sentence.get_embeddings(sentences, self.bert_model, self.tokenizer)

    def compare_sentences(self, sentence1, sentence2):
        # Wrap the first string in a Sentence so the instance method
        # receives a proper object as self, not a raw string.
        return Sentence(sentence1).compare_sentences(sentence2, self.bert_model, self.tokenizer)
```
With these changes, `compare_sentences` correctly obtains the embeddings of both sentences and computes their cosine similarity. Note that a static method needs no instance (no `self`), but an instance method must be called on an appropriate object; that is exactly why `EmbeddingCompare.compare_sentences` has to wrap its first argument in a `Sentence` before delegating. I hope this solves your problem! If you have any further questions, feel free to ask.
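For reference, here is a minimal usage sketch of the corrected classes. The CSV path and the two example sentences are hypothetical placeholders, and the model weights are downloaded from the Hugging Face Hub on first use:

```python
# Minimal usage sketch; 'sentences.csv' is a hypothetical file with a
# 'sentence' column, and the two example sentences are placeholders.
comparer = EmbeddingCompare(model_name='bert-base-uncased',
                            data_path='sentences.csv')

score = comparer.compare_sentences(
    'The cat sat on the mat.',
    'A cat is sitting on a mat.',
)
print(f'cosine similarity: {score:.4f}')  # closer to 1.0 means more similar
```

Keep in mind that the pooled `[CLS]` output of a vanilla `bert-base-uncased` model is a fairly crude sentence representation; for serious similarity work, a model fine-tuned for sentence embeddings (e.g. a Sentence-BERT variant) usually gives better results.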