import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk import pos_tag, FreqDist
from tkinter import Tk
from tkinter.filedialog import askopenfilename, asksaveasfilename
from docx import Document
# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
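# Note (assumption about your NLTK version): on newer releases (3.9+), the tokenizer
# and tagger data are distributed under different names, so the following downloads
# may also be needed:
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')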
def load_blog_text():
    """
    Prompts the user to select a Word file containing multiple blogs separated by
    Heading-style paragraphs, and returns the text of each blog for analysis.
    """
    Tk().withdraw()  # Prevent the empty Tkinter root window from appearing
    word_file_path = askopenfilename(title="Select Word File", filetypes=[("Word files", "*.docx")])
    if not word_file_path:  # User cancelled the dialog
        return []
    document = Document(word_file_path)

    blogs = []
    current_blog = ""
    for para in document.paragraphs:
        if para.style.name.startswith('Heading'):  # A heading marks the start of a new blog
            if current_blog:
                blogs.append(current_blog.strip())
                current_blog = ""
            current_blog += para.text + "\n"  # Keep the header as part of the blog
        else:
            current_blog += para.text + " "
    if current_blog:
        blogs.append(current_blog.strip())  # Add the last blog
    return blogs
def analyze_text(text):
    """
    Analyzes the given text for word frequency, sentence structure, and part-of-speech tags.
    Returns a dictionary containing the analysis results.
    """
    # Tokenization
    word_tokens = word_tokenize(text)
    sent_tokens = sent_tokenize(text)

    # Remove stopwords before counting word and POS frequencies
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in word_tokens if word.lower() not in stop_words]

    # Word frequency
    word_freq = Counter(filtered_words)

    # Sentence structure: average sentence length in words (guard against empty input)
    if sent_tokens:
        avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sent_tokens) / len(sent_tokens)
    else:
        avg_sentence_length = 0.0

    # POS tagging (Penn Treebank tags) on the filtered tokens
    pos_tags = pos_tag(filtered_words)
    pos_freq = FreqDist(tag for (word, tag) in pos_tags)

    # Report
    report = {
        "total_words": len(word_tokens),
        "total_sentences": len(sent_tokens),
        "average_sentence_length": avg_sentence_length,
        "most_common_words": word_freq.most_common(10),
        "most_common_pos_tags": pos_freq.most_common(10)
    }
    return report
def save_report(reports, output_path):
    """
    Saves the analysis reports to a text file, one section per blog.
    """
    with open(output_path, 'w') as file:
        file.write("Blog Grammar Analysis Report\n")
        file.write("=" * 30 + "\n\n")
        for i, report in enumerate(reports):
            file.write(f"Blog {i + 1} Analysis\n")
            file.write("-" * 30 + "\n")
            file.write(f"Total Words: {report['total_words']}\n")
            file.write(f"Total Sentences: {report['total_sentences']}\n")
            file.write(f"Average Sentence Length: {report['average_sentence_length']:.2f} words\n\n")
            file.write("Most Common Words:\n")
            for word, freq in report['most_common_words']:
                file.write(f"{word}: {freq}\n")
            file.write("\nMost Common POS Tags:\n")
            for tag, freq in report['most_common_pos_tags']:
                file.write(f"{tag}: {freq}\n")
            file.write("\n\n")
    print(f"Report saved to: {output_path}")
def main():
    # Load and analyze the blog text
    blogs = load_blog_text()
    if not blogs:
        print("No blogs to analyze.")
        return
    reports = [analyze_text(blog) for blog in blogs]

    # Ask where to save the report (a save dialog, not an open dialog)
    Tk().withdraw()  # Prevent the empty Tkinter root window from appearing
    output_path = asksaveasfilename(title="Save Report As", defaultextension=".txt", filetypes=[("Text files", "*.txt")])
    if output_path:
        save_report(reports, output_path)

if __name__ == "__main__":
    main()
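# Quick check without the file dialogs (a minimal sketch; the sample string below is
# made up purely for illustration, not taken from any real blog):
#
#     sample = "NLTK makes tokenization easy. It also ships a part-of-speech tagger."
#     print(analyze_text(sample))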