Untitled
unknown
python
a month ago
4.0 kB
0
Indexable
Never
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk import pos_tag, FreqDist
from tkinter import Tk
from tkinter.filedialog import askopenfilename, asksaveasfilename
from docx import Document

# Download necessary NLTK datasets (no-ops if already cached locally).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')


def load_blog_text():
    """
    Prompt the user to select a Word file containing multiple blogs
    separated by Heading-style paragraphs and read the content for analysis.

    Returns:
        list[str]: one entry per blog (header line + body text), or an
        empty list if the user cancels the file dialog.
    """
    Tk().withdraw()  # Prevents the Tkinter root window from appearing
    word_file_path = askopenfilename(
        title="Select Word File",
        filetypes=[("Word files", "*.docx")],
    )
    if not word_file_path:
        # Dialog cancelled -> empty path; Document("") would raise.
        return []

    document = Document(word_file_path)
    blogs = []
    current_blog = ""
    for para in document.paragraphs:
        if para.style.name.startswith('Heading'):  # Assuming headers use a Heading style
            # A new heading closes out the previous blog, if any.
            if current_blog:
                blogs.append(current_blog.strip())
                current_blog = ""
            current_blog += para.text + "\n"  # Add the header to the blog
        else:
            current_blog += para.text + " "
    if current_blog:
        blogs.append(current_blog.strip())  # Add the last blog
    return blogs


def analyze_text(text):
    """
    Analyze the given text for word frequency, sentence structure,
    and part-of-speech tags.

    Args:
        text (str): the blog text to analyze.

    Returns:
        dict: total_words, total_sentences, average_sentence_length,
        most_common_words (top 10 non-stopword tokens with counts),
        and most_common_pos_tags (top 10 POS tags with counts).
    """
    # Tokenization
    word_tokens = word_tokenize(text)
    sent_tokens = sent_tokenize(text)

    # Removing stopwords (case-insensitive match against the English list)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in word_tokens if word.lower() not in stop_words]

    # Word Frequency
    word_freq = Counter(filtered_words)

    # Sentence Structure Analysis — guard against empty input, which
    # previously raised ZeroDivisionError on blank blogs.
    if sent_tokens:
        avg_sentence_length = (
            sum(len(word_tokenize(sentence)) for sentence in sent_tokens)
            / len(sent_tokens)
        )
    else:
        avg_sentence_length = 0.0

    # POS Tagging on the stopword-filtered tokens
    pos_tags = pos_tag(filtered_words)
    pos_freq = FreqDist(tag for (_word, tag) in pos_tags)

    return {
        "total_words": len(word_tokens),
        "total_sentences": len(sent_tokens),
        "average_sentence_length": avg_sentence_length,
        "most_common_words": word_freq.most_common(10),
        "most_common_pos_tags": pos_freq.most_common(10),
    }


def save_report(reports, output_path):
    """
    Save the analysis reports (one section per blog) to a text file.

    Args:
        reports (list[dict]): results from analyze_text, in blog order.
        output_path (str): destination path for the report file.
    """
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write("Blog Grammar Analysis Report\n")
        file.write("=" * 30 + "\n\n")
        for i, report in enumerate(reports):
            file.write(f"Blog {i + 1} Analysis\n")
            file.write("-" * 30 + "\n")
            file.write(f"Total Words: {report['total_words']}\n")
            file.write(f"Total Sentences: {report['total_sentences']}\n")
            file.write(f"Average Sentence Length: {report['average_sentence_length']:.2f} words\n\n")
            file.write("Most Common Words:\n")
            for word, freq in report['most_common_words']:
                file.write(f"{word}: {freq}\n")
            file.write("\nMost Common POS Tags:\n")
            for tag, freq in report['most_common_pos_tags']:
                file.write(f"{tag}: {freq}\n")
            file.write("\n\n")
    print(f"Report saved to: {output_path}")


def main():
    """Load blogs from a Word file, analyze each, and save a combined report."""
    # Load and analyze the blog text
    blogs = load_blog_text()
    if not blogs:
        print("No blogs loaded; nothing to analyze.")
        return
    reports = [analyze_text(blog) for blog in blogs]

    # Save the report
    Tk().withdraw()  # Prevents the Tkinter root window from appearing
    # Fix: the original used askopenfilename for a *save* target, which
    # requires an existing file and ignores defaultextension;
    # asksaveasfilename is the correct dialog here.
    output_path = asksaveasfilename(
        title="Save Report As",
        defaultextension=".txt",
        filetypes=[("Text files", "*.txt")],
    )
    if not output_path:
        # Dialog cancelled -> empty path; open("") would raise.
        print("Save cancelled; report not written.")
        return
    save_report(reports, output_path)


if __name__ == "__main__":
    main()
Leave a Comment