"""Data format: CSV.

['content']: review text.
['score']: star rating (1-5), where 5 stars is best.

Preprocessing: collapse the 1-5 star ratings into three sentiment levels:
1-2 stars -> 'negative', 3 stars -> 'neutral', 4-5 stars -> 'positive'.
"""

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn


# Tokenization / batching hyper-parameters.
BATCH_SIZE = 16
MAX_LEN = 160  # max token length per review; longer inputs are truncated

# Class index -> human-readable label for the 3-way sentiment task.
class_names = ['negative', 'neutral', 'positive']
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): cased English BERT — confirm this matches the language of the reviews.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Expected columns: 'content' (review text) and 'score' (1-5 stars).
df = pd.read_csv('data/reviews.csv')


def to_sentiment(score):
    """Map a 1-5 star rating to a 0-indexed sentiment class.

    1-2 stars -> 0 ('negative'), 3 stars -> 1 ('neutral'),
    4-5 stars -> 2 ('positive').

    Labels are 0-indexed so they align with ``class_names`` and are valid
    targets for a 3-class ``nn.CrossEntropyLoss`` (the previous 1-3 labels
    put 'positive' at index 3, out of range for 3 classes, and shifted the
    label->name lookup by one).

    Args:
        score: star rating; anything accepted by ``int()`` (e.g. "5", 5.0).

    Returns:
        int: sentiment class in {0, 1, 2}.
    """
    score = int(score)
    if score <= 2:
        return 0
    if score == 3:
        return 1
    return 2


# Derive the 3-class sentiment label from the raw star score.
df['sentiment'] = df.score.apply(to_sentiment)
# 90/5/5 split: carve off 10%, then halve that holdout into val and test.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=1)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=1)


class ReviewsDataset(Dataset):
    """Torch dataset pairing raw review texts with integer sentiment targets.

    Each item is tokenized lazily with the supplied HuggingFace tokenizer,
    padded/truncated to ``max_len``, and returned as flat 1-D tensors ready
    for batching by a DataLoader.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Cache the size once; the sequences are not expected to change.
        self.len = len(self.reviews)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        text = self.reviews[idx]
        label = self.targets[idx]

        enc = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        sample = {
            'review_text': text,
            'targets': torch.tensor(label, dtype=torch.long),
        }
        # encode_plus returns (1, max_len) tensors; flatten to (max_len,).
        for key in ('input_ids', 'attention_mask', 'token_type_ids'):
            sample[key] = enc[key].flatten()
        return sample


def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True):
    """Build a DataLoader over a reviews DataFrame.

    Args:
        df: DataFrame with 'content' (text) and 'sentiment' (label) columns.
        tokenizer: HuggingFace tokenizer used to encode each review.
        max_len: pad/truncate length passed to the dataset.
        batch_size: number of samples per batch.
        shuffle: reshuffle every epoch. Defaults to True for backward
            compatibility; pass False for val/test loaders so evaluation
            order is deterministic.

    Returns:
        torch.utils.data.DataLoader yielding dicts of batched tensors.
    """
    ds = ReviewsDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)


# One loader per split. All three currently shuffle (create_data_loader's
# default); shuffling val/test makes per-batch inspection non-deterministic.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# Bare BERT encoder (no classification head yet), moved to GPU if available.
model = BertModel.from_pretrained('bert-base-cased')
model = model.to(device)

# Smoke-test: push training batches through the bare encoder and print the
# output keys. NOTE(review): this iterates the ENTIRE training set just to
# print the same keys each time — a `break` after the first batch would do.
for data in train_data_loader:
    input_ids = data['input_ids'].to(device)
    attention_mask = data["attention_mask"].to(device)
    targets = data["targets"].to(device)  # unused here; needed once a loss is added
    token_type_ids = data['token_type_ids'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                    token_type_ids=token_type_ids, return_dict=True)
    print(outputs.keys())  # odict_keys(['last_hidden_state', 'pooler_output'])

# pasted: 09-09 06:31