import re
from functools import lru_cache

import numpy as np
import requests
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained German Wiki model (load any other model depending on
# your use case).
w2v_model = KeyedVectors.load_word2vec_format(
    r'C:\Users\YassineEuchi\Desktop\BachelorArbeitFinalCode\ProjectWord2VecDeutschWiki\german.model',
    binary=True,
)


@lru_cache(maxsize=1)
def _german_stop_words():
    """Return the NLTK German stop-word list as a frozenset, built only once."""
    return frozenset(stopwords.words('german'))


def preprocess_text(text):
    """Tokenize *text* and drop German stop words.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list of tokens (original casing kept) whose lowercase form is not
        in the NLTK German stop-word list.
    """
    # NOTE(review): word_tokenize is called with its default language here;
    # consider passing language='german' if that fits the corpus better.
    stop_words = _german_stop_words()
    return [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]


def _mean_vector(tokens):
    """Return the mean word vector of the in-vocabulary *tokens*, or None.

    None is returned when no token is known to ``w2v_model`` (or the mean is
    not finite), because such a text cannot be compared by cosine similarity.
    """
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if not vectors:
        return None
    mean = np.mean(vectors, axis=0)
    if not np.all(np.isfinite(mean)):
        return None
    return mean


def identify_disputed_part(article_text, talk_text, threshold=0.7):
    """Label each article paragraph by its similarity to the talk page.

    Both texts are represented as the average of their word vectors. Each
    paragraph (split on blank lines) is compared to the talk-page text with
    cosine similarity.

    Args:
        article_text: Full article text; paragraphs are separated by '\\n\\n'.
        talk_text: Text of the corresponding talk page.
        threshold: Similarity strictly above this value marks a paragraph
            as 'DISPUTED'. Defaults to 0.7.

    Returns:
        A list of ``(paragraph, label)`` tuples where label is 'DISPUTED' or
        'NON-DISPUTED'. Paragraphs without any in-vocabulary token are
        omitted; the list is empty when the talk text has no in-vocabulary
        token.
    """
    # The talk-page vector does not depend on the paragraph, so build it once.
    talk_vector = _mean_vector(preprocess_text(talk_text))
    if talk_vector is None:
        return []
    talk_matrix = talk_vector.reshape(1, -1)

    disputed_parts = []
    for paragraph in article_text.split('\n\n'):
        p_vector = _mean_vector(preprocess_text(paragraph))
        if p_vector is None:
            # Nothing to compare (empty paragraph or only unknown words).
            continue
        similarity = cosine_similarity(p_vector.reshape(1, -1), talk_matrix)[0][0]
        label = 'DISPUTED' if similarity > threshold else 'NON-DISPUTED'
        disputed_parts.append((paragraph, label))
    return disputed_parts


def clean_text(text):
    """Strip HTML tags, punctuation/symbols and redundant whitespace.

    Args:
        text: Raw text, possibly containing HTML markup.

    Returns:
        The text with tags removed, every character that is neither
        alphanumeric nor whitespace removed, and whitespace runs collapsed
        to single spaces (leading/trailing whitespace stripped).
    """
    # Remove all HTML tags.
    cleaned = re.sub(r'<[^<]+?>', '', text)
    # Remove any non-alphanumeric characters. \w is Unicode-aware for str
    # patterns, so German letters such as ä, ö, ü and ß are kept; the
    # underscore is matched by \w and therefore removed explicitly.
    cleaned = re.sub(r'[^\w\s]|_', '', cleaned)
    # Collapse extra whitespace.
    return re.sub(r'\s+', ' ', cleaned).strip()


# Category page listing the articles to scan.
URL = "https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Widerspruch"
# Category page listing the articles to scan.
URL = "https://de.wikipedia.org/wiki/Kategorie:Wikipedia:Widerspruch"
# Seconds to wait for any single HTTP request before giving up; without a
# timeout, requests can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30


def _fetch_content_text(url):
    """Return the text of the 'mw-content-text' div at *url*, or None.

    None is returned when the request fails or the page has no such div.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(f"Request failed for {url}: {err}")
        return None
    content = BeautifulSoup(response.content, 'html.parser').find(
        'div', id='mw-content-text'
    )
    return content.text if content is not None else None


# Fetch and parse the category page once.
page = requests.get(URL, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(page.content, 'html.parser')

# Open a file to write the disputed content to.
with open("umstrittene_inhalteModelne.txt", "w", encoding="utf-8") as f:
    for div in soup.find_all('div', class_='mw-category-group'):
        # Only anchors that actually carry an href can be followed.
        for link in div.find_all('a', href=True):
            title = link.text
            article_url = "https://de.wikipedia.org" + link['href']
            talk_page_url = article_url.replace("/wiki/", "/wiki/Diskussion:")

            article_text = _fetch_content_text(article_url)
            talk_text = _fetch_content_text(talk_page_url)
            if article_text is None or talk_text is None:
                # Skip this article instead of aborting the whole crawl.
                print(f"Skipping {title}: article or talk page content unavailable")
                continue

            print(f"Article: {title}")
            for part, label in identify_disputed_part(article_text, talk_text):
                # Write the title, disputed part, and label to the file.
                f.write(f"Title: {title}\nDisputed Part: {part}\nLabel: {label}\n\n")
