import re

import numpy as np
import requests
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

# One-time setup: nltk.download('punkt') and nltk.download('stopwords') must have been run.
# Load the pre-trained German Wikipedia word2vec model (substitute any model that fits your use case)
w2v_model = KeyedVectors.load_word2vec_format(r'C:\Users\YassineEuchi\Desktop\BachelorArbeitFinalCode\ProjectWord2VecDeutschWiki\german.model', binary=True)
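# Optional sanity check that the embeddings loaded correctly (the example word is
# an assumption; available entries depend on the specific german.model file):
# print(w2v_model.most_similar('Deutschland', topn=3))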
def preprocess_text(text):
    # Tokenize the text with the German Punkt model (the default is English)
    tokens = word_tokenize(text, language='german')
    # Remove German stop words
    stop_words = set(stopwords.words('german'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens
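# Example (assuming NLTK's German resources are installed):
# preprocess_text("Die Katze sitzt auf der Matte.")
# -> ['Katze', 'sitzt', 'Matte', '.']   (stop words like "die", "auf", "der" are removed)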
def identify_disputed_part(article_text, talk_text):
    threshold = 0.7
    # Preprocess the talk page text and average its word vectors once, outside the loop
    talk_tokens = preprocess_text(talk_text)
    talk_vecs = [w2v_model[word] for word in talk_tokens if word in w2v_model]
    if not talk_vecs:
        return []
    talk_vector = np.mean(talk_vecs, axis=0)
    # Split the article text into paragraphs
    paragraphs = article_text.split('\n\n')
    disputed_parts = []
    for p in paragraphs:
        # Preprocess the paragraph and average its word vectors
        p_tokens = preprocess_text(p)
        p_vecs = [w2v_model[word] for word in p_tokens if word in w2v_model]
        # Skip paragraphs with no in-vocabulary tokens (np.mean over an empty list is undefined)
        if not p_vecs:
            continue
        p_vector = np.mean(p_vecs, axis=0)
        # Calculate cosine similarity between the two averaged vectors
        similarity = cosine_similarity(p_vector.reshape(1, -1), talk_vector.reshape(1, -1))[0][0]
        # Label the paragraph according to the similarity threshold
        if similarity > threshold:
            disputed_parts.append((p, 'DISPUTED'))
        else:
            disputed_parts.append((p, 'NON-DISPUTED'))
    return disputed_parts
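# Example usage with hypothetical strings, for illustration only:
# parts = identify_disputed_part("Absatz eins.\n\nAbsatz zwei.", "Diskussion zu Absatz zwei.")
# for paragraph, label in parts:
#     print(label, paragraph[:40])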
def clean_text(text):
    # Remove all HTML tags
    cleaned = re.sub(r'<[^<]+?>', '', text)
    # Remove non-alphanumeric characters, keeping German umlauts and ß
    cleaned = re.sub(r'[^a-zA-Z0-9äöüÄÖÜß\s]', '', cleaned)
    # Collapse extra whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned
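# Example: clean_text("<p>Hallo,   Welt!</p>") -> "Hallo Welt"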
URL = "https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Widerspruch"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# Open a file to write the disputed content to
with open("umstrittene_inhalteModelne.txt", "w", encoding="utf-8") as f:
article_divs = soup.find_all('div', class_='mw-category-group')
for div in article_divs:
article_links = div.find_all('a')
for link in article_links:
title = link.text
article_url = "https://de.wikipedia.org" + link['href']
            article_page = requests.get(article_url)
            article_soup = BeautifulSoup(article_page.content, 'html.parser')
            content_div = article_soup.find('div', id='mw-content-text')
            # German Wikipedia talk pages live in the "Diskussion:" namespace
            talk_page_url = article_url.replace("/wiki/", "/wiki/Diskussion:")
            talk_page = requests.get(talk_page_url)
            talk_soup = BeautifulSoup(talk_page.content, 'html.parser')
            talk_div = talk_soup.find('div', id='mw-content-text')
            # Skip entries where either page has no parseable content
            if content_div is None or talk_div is None:
                continue
            article_text = content_div.text
            talk_text = talk_div.text
print(f"Article: {title}")
disputed_parts = identify_disputed_part(article_text, talk_text)
if disputed_parts:
for part, label in disputed_parts:
# Write the title, disputed part, and label to the file
f.write(f"Title: {title}\nDisputed Part: {part}\nLabel: {label}\n\n")
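# Note: for polite crawling, a short delay between requests (e.g. time.sleep(1)
# from the standard library) could be added inside the inner link loop.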