// ..
//
// mail@pastecode.io avatar
// unknown
// plain_text
// 18 days ago
// 5.7 kB
// 3
// Indexable
// Never
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

/**
 * Command-line tool that computes TF-IDF scores for a list of (term, document id) queries.
 *
 * <p>The documents file is cut into documents of {@link #LINES_PER_DOCUMENT} consecutive lines;
 * the test-case file holds the query terms on its first line and the matching document ids on
 * its second line. One {@code docId:term:score} entry per query is written to
 * {@code output.txt} in the working directory.
 */
public class TFIDFCalculator {

    /** Token separator used inside normalized documents and between output entries. */
    public static final String WHITESPACE = " ";
    /** Regex matching every character that is not an ASCII letter. */
    public static final String LETTERS_ONLY = "[^a-zA-Z]";
    /** Regex matching one or more whitespace characters. */
    public static final String EXTRA_SPACES = "\\s+";

    /** Number of consecutive input lines merged into one document. */
    private static final int LINES_PER_DOCUMENT = 5;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the documents file, {@code args[1]} the test-case file
     * @throws IOException if an input file cannot be read or the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.out.println("Usage: java TFIDFCalculator docs.txt tc<No.>.txt");
            return;
        }

        String docsFilePath = args[0];
        String tcFilePath = args[1];
        String outputFilePath = "output.txt";

        List<String> documents = splitDocuments(docsFilePath);
        List<TestCase> testCases = readTestCases(tcFilePath);

        Map<String, Map<Integer, Double>> tfidfScores = calculateTFIDFScores(documents, testCases);

        saveTFIDFScores(tfidfScores, testCases, outputFilePath);
    }

    /**
     * Reads the documents file and groups every {@link #LINES_PER_DOCUMENT} lines into one
     * normalized document (lower-case ASCII letters, tokens separated by exactly one space).
     * A trailing group with fewer lines still becomes a document.
     *
     * @param docsFilePath path of the raw documents file
     * @return the normalized documents in file order; a group of only blank lines yields {@code ""}
     * @throws IOException if the file cannot be read
     */
    private static List<String> splitDocuments(String docsFilePath) throws IOException {
        List<String> documents = new ArrayList<>();
        StringBuilder currentDocument = new StringBuilder();
        int pendingLines = 0;
        try (BufferedReader reader = Files.newBufferedReader(Paths.get(docsFilePath))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Locale.ROOT keeps the ASCII lower-casing independent of the JVM default locale
                // (e.g. a Turkish locale would map 'I' to a dotless 'ı').
                String normalized = line.replaceAll(LETTERS_ONLY, WHITESPACE)
                        .replaceAll(EXTRA_SPACES, WHITESPACE)
                        .toLowerCase(Locale.ROOT)
                        .trim();
                // Blank lines still count towards the group size, but appending them would leave
                // double spaces in the document and create empty tokens when it is split.
                if (!normalized.isEmpty()) {
                    currentDocument.append(WHITESPACE).append(normalized);
                }
                pendingLines++;
                if (pendingLines == LINES_PER_DOCUMENT) {
                    documents.add(currentDocument.toString().trim());
                    currentDocument.setLength(0);
                    pendingLines = 0;
                }
            }
        }
        if (pendingLines > 0) {
            documents.add(currentDocument.toString().trim());
        }
        return documents;
    }

    /**
     * Reads the test-case file: line one holds the terms, line two the document ids, both
     * whitespace separated. The i-th term is paired with the i-th id; surplus entries on the
     * longer line are ignored.
     *
     * <p>Terms are used verbatim. Documents are lower-cased by {@link #splitDocuments}, so a
     * term containing upper-case letters never matches.
     *
     * @param tcFilePath path of the test-case file
     * @return the parsed queries in file order
     * @throws IOException if the file cannot be read
     * @throws IllegalArgumentException if the file has fewer than two lines
     * @throws NumberFormatException if a document id is not a valid integer
     */
    private static List<TestCase> readTestCases(String tcFilePath) throws IOException {
        List<String> lines = Files.readAllLines(Paths.get(tcFilePath));
        if (lines.size() < 2) {
            throw new IllegalArgumentException("tc_file must contain at least two lines");
        }
        String[] terms = splitOnWhitespace(lines.get(0));
        String[] docIds = splitOnWhitespace(lines.get(1));
        int pairCount = Math.min(terms.length, docIds.length);
        List<TestCase> testCases = new ArrayList<>(pairCount);
        for (int i = 0; i < pairCount; i++) {
            testCases.add(new TestCase(terms[i], Integer.parseInt(docIds[i])));
        }
        return testCases;
    }

    /** Splits a line on runs of whitespace, never producing empty tokens. */
    private static String[] splitOnWhitespace(String line) {
        String trimmed = line.trim();
        return trimmed.isEmpty() ? new String[0] : trimmed.split(EXTRA_SPACES);
    }

    /** Splits a normalized document into its tokens; an empty document has no tokens. */
    private static String[] tokenize(String doc) {
        return doc.isEmpty() ? new String[0] : doc.split(WHITESPACE);
    }

    /**
     * Computes the TF-IDF score of every query whose document id refers to an existing document.
     *
     * @param documents normalized documents, indexed by document id
     * @param testCases queries to score
     * @return scores keyed by term, then by document id; queries with an out-of-range id are absent
     */
    private static Map<String, Map<Integer, Double>> calculateTFIDFScores(List<String> documents, List<TestCase> testCases) {
        // Document frequency: in how many documents each term occurs at least once.
        Map<String, Long> docFrequencyMap = documents.parallelStream()
                .flatMap(doc -> Arrays.stream(tokenize(doc)).distinct())
                .collect(Collectors.groupingBy(term -> term, Collectors.counting()));

        Map<String, Map<Integer, Double>> tfidfScores = new ConcurrentHashMap<>();
        testCases.parallelStream().forEach(testCase -> {
            int docId = testCase.documentId;
            // Skip ids outside the document list (negative ids included) instead of throwing.
            if (docId < 0 || docId >= documents.size()) {
                return;
            }
            String term = testCase.term;
            double tfidf = tfIdfCalculate(documents.get(docId), documents.size(), term, docFrequencyMap);
            tfidfScores.computeIfAbsent(term, k -> new ConcurrentHashMap<>()).put(docId, tfidf);
        });

        return tfidfScores;
    }

    /**
     * Returns {@code tf(doc, term) * idf(totalDocs, df(term))}.
     *
     * @param doc normalized document text
     * @param totalDocs total number of documents
     * @param term term to score
     * @param docFrequencyMap number of documents containing each term
     */
    private static double tfIdfCalculate(String doc, int totalDocs, String term, Map<String, Long> docFrequencyMap) {
        double tf = tf(doc, term);
        double idf = idf(totalDocs, docFrequencyMap.getOrDefault(term, 0L));
        return tf * idf;
    }

    /**
     * Term frequency: occurrences of {@code term} divided by the number of tokens in {@code doc}.
     *
     * @return the relative frequency, or {@code 0.0} for a document without tokens
     */
    private static double tf(String doc, String term) {
        String[] tokens = tokenize(doc);
        if (tokens.length == 0) {
            return 0.0;
        }
        long termCount = 0;
        for (String token : tokens) {
            if (token.equals(term)) {
                termCount++;
            }
        }
        return (double) termCount / tokens.length;
    }

    /**
     * Inverse document frequency: {@code ln(totalDocs / docFrequency)}. A document frequency of
     * zero is treated as one so the division is always defined.
     */
    private static double idf(int totalDocs, long docFrequency) {
        return Math.log((double) totalDocs / (docFrequency != 0 ? docFrequency : 1));
    }

    /**
     * Writes one {@code docId:term:score} entry (score with five decimals) per query, each
     * followed by a space, on a single line terminated by a line separator. Queries without a
     * computed score are written with {@code 0.00000}.
     *
     * @param tfidfScores scores keyed by term, then by document id
     * @param testCases queries, in output order
     * @param outputFilePath file to create or overwrite
     * @throws IOException if the file cannot be written
     */
    private static void saveTFIDFScores(Map<String, Map<Integer, Double>> tfidfScores, List<TestCase> testCases, String outputFilePath) throws IOException {
        try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(outputFilePath))) {
            for (TestCase testCase : testCases) {
                String term = testCase.term;
                int docId = testCase.documentId;
                Map<Integer, Double> scoresForTerm =
                        tfidfScores.getOrDefault(term, Collections.<Integer, Double>emptyMap());
                double score = scoresForTerm.getOrDefault(docId, 0.0);
                // The term is passed as an argument, never as part of the pattern, so a '%' in
                // it cannot be read as a format specifier. Locale.ROOT pins the decimal point.
                writer.write(String.format(Locale.ROOT, "%d:%s:%.5f", docId, term, score));
                writer.write(WHITESPACE);
            }
            writer.newLine();
        }
    }
}

/**
 * One TF-IDF query: a term together with the id of the document it is scored against.
 * The id is used as an index into the list of documents.
 */
class TestCase {

    /** Term whose score is requested. */
    String term;

    /** Id of the document the term is scored in. */
    Integer documentId;

    /**
     * Creates a query for the given term and document.
     *
     * @param term term to score
     * @param documentId id of the document to score the term in
     */
    TestCase(String term, Integer documentId) {
        this.documentId = documentId;
        this.term = term;
    }
}
// Leave a Comment