# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# a month ago
# 4.0 kB
# 1
# Indexable
# Never
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk import pos_tag, FreqDist
from tkinter import Tk
from tkinter.filedialog import askopenfilename
from docx import Document

# Fetch the NLTK resources used below for tokenization, stopword filtering,
# and part-of-speech tagging.
for _resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

def load_blog_text():
    """
    Prompt the user for a Word (.docx) file and split its text into blogs.

    A new blog starts at every paragraph whose style name begins with
    "Heading". The heading text is kept as the first line of its blog, and
    each following paragraph is appended followed by a single space.
    Paragraphs that appear before the first heading form a blog of their own.

    Returns:
        list[str]: The whitespace-stripped text of each blog, in document
        order. Segments that are empty after stripping are omitted. An empty
        list is returned when the file dialog is cancelled.
    """
    root = Tk()
    root.withdraw()  # Prevents the Tkinter root window from appearing
    try:
        word_file_path = askopenfilename(
            title="Select Word File",
            filetypes=[("Word files", "*.docx")],
        )
    finally:
        # Dispose of the hidden root window once the dialog has closed.
        root.destroy()

    if not word_file_path:
        # The dialog yields an empty value on cancel; there is no file to open.
        return []

    document = Document(word_file_path)
    blogs = []
    pieces = []  # Text fragments of the blog currently being collected

    def _finish_blog():
        # Store the collected blog (if it has any real text) and start afresh.
        blog_text = "".join(pieces).strip()
        if blog_text:
            blogs.append(blog_text)
        pieces.clear()

    for para in document.paragraphs:
        # Tolerate a missing style or style name instead of crashing on it.
        style_name = getattr(para.style, "name", None) or ""
        if style_name.startswith('Heading'):  # Assuming headers use a Heading style
            _finish_blog()
            pieces.append(para.text + "\n")  # Add the header to the blog
        else:
            pieces.append(para.text + " ")

    _finish_blog()  # Add the last blog

    return blogs

def analyze_text(text):
    """
    Analyze text for word frequency, sentence length, and part-of-speech tags.

    Args:
        text: The text to analyze.

    Returns:
        dict: A report with the keys
            "total_words": number of tokens produced by word_tokenize
                (punctuation tokens are included in this count).
            "total_sentences": number of sentences produced by sent_tokenize.
            "average_sentence_length": mean number of tokens per sentence,
                or 0.0 when the text contains no sentences.
            "most_common_words": up to 10 (word, count) pairs, counted over
                tokens that are not English stopwords and contain at least
                one alphanumeric character.
            "most_common_pos_tags": up to 10 (tag, count) pairs, counted over
                the tags of the stopword-filtered tokens.
    """
    # Blank input has nothing to tokenize; return an empty report instead of
    # dividing by a sentence count of zero further down.
    if not text or not text.strip():
        return {
            "total_words": 0,
            "total_sentences": 0,
            "average_sentence_length": 0.0,
            "most_common_words": [],
            "most_common_pos_tags": [],
        }

    # Tokenization
    word_tokens = word_tokenize(text)
    sent_tokens = sent_tokenize(text)

    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in word_tokens if word.lower() not in stop_words]

    # Word Frequency: skip pure-punctuation tokens such as "," or "." so they
    # do not crowd real words out of the "most common words" list.
    word_freq = Counter(
        word for word in filtered_words if any(ch.isalnum() for ch in word)
    )

    # Sentence Structure Analysis
    if sent_tokens:
        tokens_in_sentences = sum(len(word_tokenize(sentence)) for sentence in sent_tokens)
        avg_sentence_length = tokens_in_sentences / len(sent_tokens)
    else:
        avg_sentence_length = 0.0

    # POS Tagging
    pos_tags = pos_tag(filtered_words)
    pos_freq = FreqDist(tag for (_word, tag) in pos_tags)

    # Report
    return {
        "total_words": len(word_tokens),
        "total_sentences": len(sent_tokens),
        "average_sentence_length": avg_sentence_length,
        "most_common_words": word_freq.most_common(10),
        "most_common_pos_tags": pos_freq.most_common(10),
    }

def save_report(reports, output_path):
    """
    Save the analysis reports to a UTF-8 encoded text file.

    Args:
        reports: Iterable of report dicts with the keys "total_words",
            "total_sentences", "average_sentence_length",
            "most_common_words" and "most_common_pos_tags"; the last two
            hold (item, count) pairs.
        output_path: Path of the text file to create or overwrite.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    lines = [
        "Blog Grammar Analysis Report\n",
        "=" * 30 + "\n\n",
    ]

    for number, report in enumerate(reports, start=1):
        lines.append(f"Blog {number} Analysis\n")
        lines.append("-" * 30 + "\n")
        lines.append(f"Total Words: {report['total_words']}\n")
        lines.append(f"Total Sentences: {report['total_sentences']}\n")
        lines.append(f"Average Sentence Length: {report['average_sentence_length']:.2f} words\n\n")

        lines.append("Most Common Words:\n")
        lines.extend(f"{word}: {freq}\n" for word, freq in report['most_common_words'])

        lines.append("\nMost Common POS Tags:\n")
        lines.extend(f"{tag}: {freq}\n" for tag, freq in report['most_common_pos_tags'])
        lines.append("\n\n")

    # Explicit UTF-8: tokens from the analyzed text may contain characters
    # (curly quotes, accented letters) that a platform-default codec rejects.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)

    print(f"Report saved to: {output_path}")

def main():
    """
    Run the interactive workflow: load blogs, analyze them, save the report.

    The user picks the input Word file, each blog found in it is analyzed,
    and the user is then asked where to save the text report. Nothing is
    written when no blog text was loaded or when the save dialog is
    cancelled.
    """
    # Imported locally because the file-level imports only provide the
    # "open" dialog, which cannot be used to choose a not-yet-existing file.
    from tkinter.filedialog import asksaveasfilename

    # Load and analyze the blog text
    blogs = load_blog_text()
    if not blogs:
        print("No blog text loaded; nothing to analyze.")
        return
    reports = [analyze_text(blog) for blog in blogs]

    # Save the report
    root = Tk()
    root.withdraw()  # Prevents the Tkinter root window from appearing
    try:
        output_path = asksaveasfilename(
            title="Save Report As",
            defaultextension=".txt",
            filetypes=[("Text files", "*.txt")],
        )
    finally:
        # Dispose of the hidden root window once the dialog has closed.
        root.destroy()

    if not output_path:
        # The dialog yields an empty value on cancel; there is nowhere to write.
        print("No output file selected; report not saved.")
        return

    save_report(reports, output_path)

# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
# Leave a Comment