您的位置:首页 > 篮球集锦 > NBA集锦 > 篮网集锦
2024年12月24日 NBA常规赛 篮网vs热火全场集锦
2024-12-24 12:23:57
请帮我检查一下这段代码,看是否存在逻辑错误:import numpy as np import torch import copy from transformers import BertTokenizer, BertModel import pandas as pd import time class Sentence: def __init__(self, sentence): self.sentence = sentence self.embeddings = None @staticmethod def get_embeddings(sentences, model, tokenizer): embeddings_list = [] for sent in sentences: input_ids = torch.tensor([tokenizer.encode(sent, add_special_tokens=True)])[0] with torch.no_grad(): bert_embedding = model(input_ids.unsqueeze(0))[1] # [CLS]的embedding embeddings_list.append(bert_embedding) return np.stack(embeddings_list) def get_bert_embeddings(self, model, tokenizer): self.embeddings = Sentence.get_embeddings([self.sentence], model, tokenizer).squeeze() @staticmethod def cos_similarity(x1, x2): return torch.sum(torch.mul(x1, x2)) / (torch.norm(x1) * torch.norm(x2)) def get_bert_embedding(self, model, tokenizer): self.get_bert_embeddings(model, tokenizer) def compare_sentences(self, other_sentence, model, tokenizer): sentence_1 = Sentence(self.sentence) sentence_2 = Sentence(other_sentence) sentence_1.get_bert_embeddings(model, tokenizer) sentence_2.get_bert_embeddings(model, tokenizer) return Sentence.cos_similarity(sentence_1.embeddings, sentence_2.embeddings).item() def __repr__(self): return self.sentence class EmbeddingCompare: def __init__(self, model_name='bert-base-uncased', data_path=None): if not data_path: raise ValueError('data_path is required') tokenizer = BertTokenizer.from_pretrained(model_name) bert_model = BertModel.from_pretrained(model_name) self.tokenizer = tokenizer self.bert_model = bert_model def load_data(self, data_path): df = pd.read_csv(data_path) return list(df['sentence']) def get_bert_embeddings_for_sentences(self, sentences): return Sentence.get_embeddings(sentences, self.bert_model, self.tokenizer) def compare_sentences(self, sentence1, sentence2): return Sentence.compare_sentences(sentence1, sentence2, self.bert_model, self.tokenizer) 
这段代码中,`Sentence`类的`compare_sentences`方法两次调用了`get_bert_embeddings`,随后调用了一次`cos_similarity`。问题出在第一个句子的处理上:在`Sentence`类中的`compare_sentences`方法里,应该直接使用传入的`self`对象来获取嵌入向量,而不是用`self.sentence`再创建一个新的`Sentence`实例;`other_sentence`传入的是句子文本,仍然需要包装成`Sentence`实例。 具体来说,将以下部分: ```python sentence_1 = Sentence(self.sentence) sentence_2 = Sentence(other_sentence) ``` 改为: ```python sentence_1 = self sentence_2 = Sentence(other_sentence) ``` 修正后的代码如下: ```python import numpy as np import torch import copy from transformers import BertTokenizer, BertModel import pandas as pd import time class Sentence: def __init__(self, sentence): self.sentence = sentence self.embeddings = None @staticmethod def get_embeddings(sentences, model, tokenizer): embeddings_list = [] for sent in sentences: input_ids = torch.tensor([tokenizer.encode(sent, add_special_tokens=True)])[0] with torch.no_grad(): bert_embedding = model(input_ids.unsqueeze(0))[1] # [CLS]的embedding embeddings_list.append(bert_embedding) return np.stack(embeddings_list) def get_bert_embeddings(self, model, tokenizer): self.embeddings = Sentence.get_embeddings([self.sentence], model, tokenizer).squeeze() @staticmethod def cos_similarity(x1, x2): return torch.sum(torch.mul(x1, x2)) / (torch.norm(x1) * torch.norm(x2)) def get_bert_embedding(self, model, tokenizer): self.get_bert_embeddings(model, tokenizer) def compare_sentences(self, other_sentence, model, tokenizer): sentence_1 = self sentence_2 = Sentence(other_sentence) sentence_1.get_bert_embeddings(model, tokenizer) sentence_2.get_bert_embeddings(model, tokenizer) return Sentence.cos_similarity(sentence_1.embeddings, sentence_2.embeddings).item() def __repr__(self): return self.sentence class EmbeddingCompare: def __init__(self, model_name='bert-base-uncased', data_path=None): if not data_path: raise ValueError('data_path is required') tokenizer = BertTokenizer.from_pretrained(model_name) bert_model = BertModel.from_pretrained(model_name) self.tokenizer = tokenizer self.bert_model = bert_model def load_data(self, data_path): df = pd.read_csv(data_path) return 
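list(df['sentence']) def get_bert_embeddings_for_sentences(self, sentences): return Sentence.get_embeddings(sentences, self.bert_model, self.tokenizer) def compare_sentences(self, sentence1, sentence2): return Sentence.compare_sentences(sentence1, sentence2, self.bert_model, self.tokenizer) ``` 这样,`compare_sentences`方法就会复用当前实例来获取嵌入向量,并计算两个句子之间的余弦相似度。不过上面的代码仍有两处需要留意:其一,`get_embeddings`通过`np.stack`返回的是NumPy数组,而`cos_similarity`使用的是`torch`运算,两者类型需要统一(例如改用`torch.stack`,或在计算前用`torch.as_tensor`转换);其二,`EmbeddingCompare.compare_sentences`把字符串`sentence1`当作`self`传给了实例方法`Sentence.compare_sentences`,应改为`Sentence(sentence1).compare_sentences(sentence2, self.bert_model, self.tokenizer)`。请注意,在调用静态方法时不需要传递类实例(即不需要使用`self`),但是当涉及到实例方法时,则需要通过实例来调用并传递适当的参数。修正后的方法使代码逻辑更加清晰和准确。希望这能解决你遇到的问题!如果还有其他问题,请继续提问。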