Untitled
unknown
plain_text
a year ago
3.1 kB
0
Indexable
Never
# Fine-tune BERT to route support tickets to their assignment group.
#
# Pipeline: load the spreadsheet -> clean the free-text columns with neattext
# -> split train/validation -> tokenize -> train and evaluate with the
# Hugging Face Trainer.

import neattext.functions as nfx
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

# Column holding the target label. Its values are expected to end in
# "_<int>"; that trailing integer is used as the class id.
LABEL_COLUMN = 'Assignment group'
# Size of the classification head; every parsed class id must be < NUM_LABELS.
NUM_LABELS = 74
# Evaluate and checkpoint on the same cadence. Left unset, eval_steps falls
# back to logging_steps, which would run a full validation pass every 10 steps.
EVAL_AND_SAVE_STEPS = 500


def preprocess_text(text):
    """Clean one cell of free text with neattext.

    Args:
        text: Cell value. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or the original value if it was not a string.
    """
    if not isinstance(text, str):
        return text

    # Pattern-based removals go first: they rely on markup, '@', digits and
    # separators that the character-stripping steps below would destroy.
    text = nfx.remove_html_tags(text)
    text = nfx.remove_userhandles(text)
    text = nfx.remove_dates(text)

    # Character-level stripping.
    text = nfx.remove_numbers(text)
    text = nfx.remove_puncts(text)
    text = nfx.remove_special_characters(text)

    # Stopwords, then collapse whatever whitespace the removals left behind.
    text = nfx.remove_stopwords(text)
    text = nfx.remove_multiple_spaces(text)
    return text


def tokenize_data(data):
    """Tokenize the ticket text of *data* and attach integer labels.

    Uses the module-level ``tokenizer``.

    Args:
        data: DataFrame with 'Short description', 'Description', 'Caller'
            and 'Assignment group' columns.

    Returns:
        The tokenizer output (input ids and attention mask as PyTorch
        tensors) with an extra "labels" tensor, one entry per row.
    """
    # The label column is deliberately NOT part of the text: feeding the
    # target into the input would leak the answer to the model.
    # astype(str) keeps concatenation safe if a cell is not a string.
    input_text = (
        data['Short description'].astype(str)
        + ' ' + data['Description'].astype(str)
        + ' ' + data['Caller'].astype(str)
    )
    inputs = tokenizer(
        input_text.tolist(),  # the tokenizer expects a list of strings
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True,
        return_token_type_ids=False,
        verbose=True,
    )
    # Class id = integer after the last underscore of the group name.
    inputs["labels"] = torch.tensor(
        [int(group.split('_')[-1]) for group in data[LABEL_COLUMN]]
    )
    return inputs


class TicketDataset(torch.utils.data.Dataset):
    """Map-style dataset exposing tokenized tickets one example at a time.

    Trainer indexes its datasets by integer and calls ``len()`` on them; the
    raw tokenizer output is keyed by field name instead, so it is wrapped.
    """

    def __init__(self, encodings):
        # encodings: mapping of field name -> tensor whose first dimension
        # is the example index (as returned by tokenize_data).
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["labels"])

    def __getitem__(self, idx):
        return {key: values[idx] for key, values in self.encodings.items()}


def compute_metrics(eval_pred):
    """Return the accuracy that ``metric_for_best_model`` refers to.

    Args:
        eval_pred: Evaluation output exposing ``predictions`` (logits) and
            ``label_ids``.

    Returns:
        ``{"accuracy": <fraction of correct predictions>}``.
    """
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


# ---- Load and clean the data ----
df = pd.read_excel("Automatic Ticket Assignment.xlsx")
df.dropna(inplace=True)

# Clean every column except the label, whose exact text encodes the class id.
for col in df.columns:
    if col != LABEL_COLUMN:
        df[col] = df[col].apply(preprocess_text)
print(df.head())

# Split the dataset into training and validation sets (test_size is adjustable).
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# ---- Tokenize ----
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

train_encodings = tokenize_data(train_df)
val_encodings = tokenize_data(val_df)
print("Train Dataset ", train_encodings)
print("Val Dataset ", val_encodings)

train_dataset = TicketDataset(train_encodings)
val_dataset = TicketDataset(val_encodings)

# ---- Training and prediction ----
model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=NUM_LABELS
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=EVAL_AND_SAVE_STEPS,
    save_steps=EVAL_AND_SAVE_STEPS,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    do_eval=True,
    do_predict=True,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Required: without it no "accuracy" is reported for best-model selection.
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation results ", results)