import json
from collections import Counter

import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Function to read the SQuAD dataset
def read_squad(path):
    """Load a SQuAD-format JSON file into three parallel lists.

    Walks ``data -> paragraphs -> qas -> answers`` and emits one
    (context, question, answer text) triple per reference answer. A question
    with several reference answers therefore appears once per answer, and a
    question whose ``answers`` list is empty contributes nothing.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equal-length lists, where
        ``answers`` holds the ``'text'`` string of each reference answer.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an expected key is missing from the document.
    """
    # Decode as UTF-8 explicitly rather than relying on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    # Keep only the answer string: the scoring helpers in this
                    # file call ``.split()`` on each reference answer.
                    answers.append(answer['text'])

    return contexts, questions, answers

# Function to evaluate the model using F1 score and Exact Match
def evaluate_model(model, tokenizer, contexts, questions, answers):
    """Score a question-answering model against reference answers.

    Builds a ``'question-answering'`` pipeline from ``model`` and
    ``tokenizer``, runs it on every (context, question) pair, and compares
    the predicted answer with the matching reference answer using
    ``compute_f1`` and ``compute_exact``.

    Args:
        model: Model handed to the pipeline.
        tokenizer: Tokenizer handed to the pipeline.
        contexts: Passages, iterated in lockstep with the other two arguments.
        questions: Questions, one per context entry.
        answers: Reference answers, one per question.

    Returns:
        A ``(mean_f1, mean_exact_match)`` tuple, each computed with
        ``np.mean`` over the per-example scores.
    """
    qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)
    per_example_f1 = []
    per_example_em = []

    for ctx, qn, reference in zip(contexts, questions, answers):
        prediction = qa_pipeline({'question': qn, 'context': ctx})
        predicted_text = prediction['answer']
        per_example_f1.append(compute_f1(reference, predicted_text))
        per_example_em.append(compute_exact(reference, predicted_text))

    mean_f1 = np.mean(per_example_f1)
    mean_em = np.mean(per_example_em)
    return mean_f1, mean_em

# Function to compute the F1 score
def compute_f1(answer, model_answer):
    """Return the token-level F1 between a reference and a predicted answer.

    Both strings are split on whitespace only; no case or punctuation
    normalization is applied. Token overlap is counted as a multiset, so a
    token that occurs several times is matched once per occurrence it shares
    with the other side.

    Args:
        answer: Reference answer string.
        model_answer: Predicted answer string.

    Returns:
        A float in ``[0.0, 1.0]``. When either side has no tokens the score is
        1.0 if both are empty and 0.0 otherwise.
    """
    answer_tokens = answer.split()
    model_answer_tokens = model_answer.split()

    # Precision/recall are undefined without tokens on both sides; treat two
    # empty answers as a perfect match and anything else as a miss.
    if not answer_tokens or not model_answer_tokens:
        return float(answer_tokens == model_answer_tokens)

    # Counter intersection keeps the minimum count per token, so duplicates
    # are not collapsed the way a plain set intersection would.
    overlap = Counter(answer_tokens) & Counter(model_answer_tokens)
    num_common = sum(overlap.values())
    if num_common == 0:
        return 0.0

    precision = num_common / len(model_answer_tokens)
    recall = num_common / len(answer_tokens)
    return (2 * precision * recall) / (precision + recall)

# Function to compute the Exact Match score
def compute_exact(answer, model_answer):
    """Return 1 if the predicted answer equals the reference exactly, else 0.

    The comparison is a plain ``==`` on the two values as given; no
    normalization is applied first.
    """
    is_identical = (answer == model_answer)
    return int(is_identical)

# Load the SQuAD dataset as three parallel lists (contexts, questions, answers).
# NOTE(review): the path points at a Kaggle input directory — confirm it exists
# in the environment this script runs in.
contexts, questions, answers = read_squad('/kaggle/input/squad-dataset/squad.json')

# Load the tokenizer and model for Mistral.
# NOTE(review): 'path/to/mistral' looks like a placeholder — TODO replace with
# the real checkpoint directory or hub id.
tokenizer_mistral = AutoTokenizer.from_pretrained('path/to/mistral')
model_mistral = AutoModelForQuestionAnswering.from_pretrained('path/to/mistral')

# Load the tokenizer and model for Gemma 7B.
# NOTE(review): both models are loaded before either is evaluated, so both are
# held at the same time — confirm the host has enough memory for that.
# NOTE(review): presumably these checkpoints need a fine-tuned
# question-answering head for the scores to be meaningful — verify.
tokenizer_gemma = AutoTokenizer.from_pretrained('google/gemma-7b')
model_gemma = AutoModelForQuestionAnswering.from_pretrained('google/gemma-7b')

# Evaluate Mistral: mean F1 and mean exact match over the whole dataset.
f1_mistral, em_mistral = evaluate_model(model_mistral, tokenizer_mistral, contexts, questions, answers)

# Evaluate Gemma 7B on the same contexts, questions and reference answers.
f1_gemma, em_gemma = evaluate_model(model_gemma, tokenizer_gemma, contexts, questions, answers)

# Print the results: one line per metric per model.
print(f'Mistral F1 Score: {f1_mistral}')
print(f'Mistral Exact Match: {em_mistral}')
print(f'Gemma 7B F1 Score: {f1_gemma}')
print(f'Gemma 7B Exact Match: {em_gemma}')
