# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# a year ago
# 3.8 kB
# 0
# Indexable
# Never
import re

import numpy as np
import requests
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained German Wiki word2vec model (stored in binary format).
# NOTE: the path below is machine-specific; point it at your own copy of the
# model, or at any other model that suits your use case.
w2v_model = KeyedVectors.load_word2vec_format(r'C:\Users\YassineEuchi\Desktop\BachelorArbeitFinalCode\ProjectWord2VecDeutschWiki\german.model', binary=True)


def preprocess_text(text, language='german'):
    """Tokenize *text* and drop stop words.

    Args:
        text: The raw text to tokenize.
        language: Name of the NLTK language used both for the Punkt
            tokenizer model and for the stop word list. Defaults to
            ``'german'`` because the rest of this script works on German
            Wikipedia pages.

    Returns:
        A list of tokens in their original casing, excluding every token
        whose lower-cased form is a stop word of *language*.
    """
    # Tokenize with the model of the text's language; word_tokenize would
    # otherwise fall back to the English Punkt model.
    tokens = word_tokenize(text, language=language)

    # Stop word lists are lower-case, so compare on the lower-cased token.
    stop_words = set(stopwords.words(language))
    return [token for token in tokens if token.lower() not in stop_words]


def _mean_word_vector(tokens):
    """Return the mean word2vec vector of *tokens* as a (1, dim) array.

    Tokens missing from ``w2v_model`` are ignored. Returns ``None`` when no
    token is in the vocabulary, so callers can skip the text explicitly
    instead of averaging an empty list (which would yield NaN).
    """
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if not vectors:
        return None
    return np.mean(vectors, axis=0).reshape(1, -1)


def identify_disputed_part(article_text, talk_text, threshold=0.7):
    """Label each paragraph of an article by its similarity to the talk page.

    Both texts are represented as the average of the word vectors of their
    preprocessed tokens; a paragraph is labelled ``'DISPUTED'`` when its
    cosine similarity to the talk page is strictly above *threshold*.

    Args:
        article_text: Article text; paragraphs are separated by blank lines
            (``'\\n\\n'``).
        talk_text: Text of the corresponding talk page.
        threshold: Similarity above which a paragraph counts as disputed.

    Returns:
        A list of ``(paragraph, label)`` tuples with label ``'DISPUTED'`` or
        ``'NON-DISPUTED'``. Paragraphs without any in-vocabulary token are
        omitted; the list is empty when the talk page has none either.
    """
    # The talk page representation does not depend on the paragraph, so it
    # is computed once rather than on every loop iteration.
    talk_vector = _mean_word_vector(preprocess_text(talk_text))
    if talk_vector is None:
        # Nothing to compare against: no paragraph can be scored.
        return []

    disputed_parts = []
    for p in article_text.split('\n\n'):
        p_vector = _mean_word_vector(preprocess_text(p))
        if p_vector is None:
            # Empty or out-of-vocabulary paragraph: it cannot be scored.
            continue

        # Calculate cosine similarity between the text representations
        similarity = cosine_similarity(p_vector, talk_vector)[0][0]
        label = 'DISPUTED' if similarity > threshold else 'NON-DISPUTED'
        disputed_parts.append((p, label))

    return disputed_parts


def clean_text(text):
    """Strip HTML tags and punctuation from *text* and normalize whitespace.

    Letters and digits of any script are kept, so German umlauts and
    ``ß`` survive the cleaning. Underscores are removed like other
    punctuation.

    Args:
        text: The raw (possibly HTML) text.

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace.
    """
    # Remove all HTML tags
    cleaned = re.sub(r'<[^<]+?>', '', text)

    # Remove everything that is neither a word character nor whitespace.
    # \w is Unicode-aware for str patterns; it also matches '_', which is
    # therefore removed explicitly.
    cleaned = re.sub(r'[^\w\s]|_', '', cleaned)

    # Collapse runs of whitespace into single spaces
    return re.sub(r'\s+', ' ', cleaned).strip()


URL = "https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Widerspruch"
# Seconds to wait for a response; requests applies no timeout by default.
REQUEST_TIMEOUT = 30


def _fetch_content_text(url):
    """Return the text of the ``mw-content-text`` div of the page at *url*.

    Returns ``None``, after printing the reason, when the request fails,
    the server answers with an HTTP error status, or the page has no such
    div.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        return None

    content = BeautifulSoup(response.content, 'html.parser').find('div', id='mw-content-text')
    if content is None:
        print(f"Skipping {url}: no content found")
        return None
    return content.text


# Fetch and parse the category page listing the articles (done once).
page = requests.get(URL, timeout=REQUEST_TIMEOUT)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Open a file to write the disputed content to
with open("umstrittene_inhalteModelne.txt", "w", encoding="utf-8") as f:
    for div in soup.find_all('div', class_='mw-category-group'):
        for link in div.find_all('a'):
            href = link.get('href')
            if not href:
                # Anchor without a target: there is no article to fetch.
                continue

            title = link.text
            article_url = "https://de.wikipedia.org" + href
            article_text = _fetch_content_text(article_url)
            if article_text is None:
                continue

            # The talk page lives under the "Diskussion:" prefix; only the
            # path prefix is rewritten, not a "/wiki/" inside the title.
            talk_page_url = article_url.replace("/wiki/", "/wiki/Diskussion:", 1)
            talk_text = _fetch_content_text(talk_page_url)
            if talk_text is None:
                continue

            print(f"Article: {title}")
            disputed_parts = identify_disputed_part(article_text, talk_text)
            for part, label in disputed_parts:
                # Write the title, disputed part, and label to the file
                f.write(f"Title: {title}\nDisputed Part: {part}\nLabel: {label}\n\n")