..
unknown
plain_text
10 months ago
5.7 kB
3
Indexable
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Command-line tool that computes TF-IDF scores for (term, document id) pairs.
 *
 * <p>The documents file is read line by line; every run of five consecutive lines forms one
 * document (a shorter trailing run forms the last document). Each line is reduced to lower-case
 * ASCII letters separated by single spaces before it is added to its document.
 *
 * <p>The test-case file holds the terms on its first line and the matching document ids on its
 * second line, both whitespace separated. One score per pair is written to {@code output.txt}
 * in the working directory.
 */
public class TFIDFCalculator {
    public static final String WHITESPACE = " ";
    public static final String LETTERS_ONLY = "[^a-zA-Z]";
    public static final String EXTRA_SPACES = "\\s+";

    // Compiled once: String.replaceAll/split would recompile the regex for every input line.
    private static final Pattern NON_LETTERS = Pattern.compile(LETTERS_ONLY);
    private static final Pattern WHITESPACE_RUN = Pattern.compile(EXTRA_SPACES);

    /** Number of consecutive input lines that make up one document. */
    private static final int LINES_PER_DOCUMENT = 5;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the documents file, {@code args[1]} the test-case file;
     *             any other argument count prints a usage line and returns
     * @throws IOException if an input file cannot be read or the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.out.println("Usage: java TFIDFCalculator docs.txt tc<No.>.txt");
            return;
        }

        String docsFilePath = args[0];
        String tcFilePath = args[1];
        String outputFilePath = "output.txt";

        List<String> documents = splitDocuments(docsFilePath);
        List<TestCase> testCases = readTestCases(tcFilePath);
        Map<String, Map<Integer, Double>> tfidfScores = calculateTFIDFScores(documents, testCases);
        saveTFIDFScores(tfidfScores, testCases, outputFilePath);
    }

    /**
     * Reads the documents file and groups every {@value #LINES_PER_DOCUMENT} lines into one
     * normalized document string whose words are separated by exactly one space.
     *
     * @param docsFilePath path of the documents file
     * @return the documents in file order; the list index is the document id
     * @throws IOException if the file cannot be read
     */
    private static List<String> splitDocuments(String docsFilePath) throws IOException {
        List<String> documents = new ArrayList<>();
        StringBuilder currentDocument = new StringBuilder();

        try (BufferedReader reader = Files.newBufferedReader(Paths.get(docsFilePath))) {
            String line;
            int lineCounter = 0;
            while ((line = reader.readLine()) != null) {
                String normalized = normalizeLine(line);
                // Lines without any letters contribute no words. Appending a separator for them
                // would create empty tokens that inflate the term-frequency denominator.
                if (!normalized.isEmpty()) {
                    if (currentDocument.length() > 0) {
                        currentDocument.append(WHITESPACE);
                    }
                    currentDocument.append(normalized);
                }

                lineCounter++;
                if (lineCounter % LINES_PER_DOCUMENT == 0) {
                    documents.add(currentDocument.toString());
                    currentDocument.setLength(0);
                }
            }
            // Leftover lines still form a document, even if none of them held a word, so that
            // the document count and ids do not depend on the content of the lines.
            if (lineCounter % LINES_PER_DOCUMENT != 0) {
                documents.add(currentDocument.toString());
            }
        }
        return documents;
    }

    /** Replaces every non-letter with a space, collapses whitespace runs, lower-cases, trims. */
    private static String normalizeLine(String line) {
        String lettersOnly = NON_LETTERS.matcher(line).replaceAll(WHITESPACE);
        String collapsed = WHITESPACE_RUN.matcher(lettersOnly).replaceAll(WHITESPACE);
        // Locale.ROOT keeps ASCII case mapping stable (e.g. 'I' -> 'i' under a Turkish locale).
        return collapsed.toLowerCase(Locale.ROOT).trim();
    }

    /** Splits text on whitespace runs; blank text yields an empty array, never empty tokens. */
    private static String[] tokenize(String text) {
        String trimmed = text.trim();
        return trimmed.isEmpty() ? new String[0] : WHITESPACE_RUN.split(trimmed);
    }

    /**
     * Reads the test-case file: terms on the first line, document ids on the second line.
     * Terms and ids are paired by position; surplus entries on the longer line are ignored.
     *
     * @param tcFilePath path of the test-case file
     * @return the (term, document id) pairs in file order
     * @throws IOException              if the file cannot be read
     * @throws IllegalArgumentException if the file has fewer than two lines
     * @throws NumberFormatException    if a document id is not a valid integer
     */
    private static List<TestCase> readTestCases(String tcFilePath) throws IOException {
        List<String> lines = Files.readAllLines(Paths.get(tcFilePath));
        if (lines.size() < 2) {
            throw new IllegalArgumentException("tc_file must contain at least two lines");
        }

        // Whitespace-run tokenizing tolerates tabs, repeated spaces and leading whitespace,
        // which would otherwise surface as empty tokens and fail in Integer.parseInt.
        String[] terms = tokenize(lines.get(0));
        String[] docIds = tokenize(lines.get(1));
        int pairCount = Math.min(terms.length, docIds.length);

        List<TestCase> testCases = new ArrayList<>(pairCount);
        for (int i = 0; i < pairCount; i++) {
            testCases.add(new TestCase(terms[i], Integer.parseInt(docIds[i])));
        }
        return testCases;
    }

    /**
     * Computes the TF-IDF score of every test case whose document id is a valid index into
     * {@code documents}. Test cases with an out-of-range id (negative or too large) are skipped
     * and therefore have no entry in the result.
     *
     * @param documents normalized documents; the list index is the document id
     * @param testCases (term, document id) pairs to score
     * @return map from term to (document id to score)
     */
    private static Map<String, Map<Integer, Double>> calculateTFIDFScores(List<String> documents,
                                                                           List<TestCase> testCases) {
        // Document frequency: the number of documents that contain each term at least once.
        // Built with a collector so no shared mutable map has to be locked.
        Map<String, Long> docFrequencyMap = documents.stream()
                .flatMap(doc -> Arrays.stream(tokenize(doc)).distinct())
                .collect(Collectors.groupingBy(term -> term, Collectors.counting()));

        // From here on docFrequencyMap is only read; results go into concurrent maps, so the
        // independent test cases can be scored in parallel.
        Map<String, Map<Integer, Double>> tfidfScores = new ConcurrentHashMap<>();
        testCases.parallelStream().forEach(testCase -> {
            int docId = testCase.documentId;
            if (docId < 0 || docId >= documents.size()) {
                return;
            }
            double tfidf = tfIdfCalculate(documents.get(docId), documents.size(), testCase.term,
                    docFrequencyMap);
            tfidfScores.computeIfAbsent(testCase.term, k -> new ConcurrentHashMap<>())
                    .put(docId, tfidf);
        });
        return tfidfScores;
    }

    /**
     * Returns {@code tf(doc, term) * idf(totalDocs, documentFrequency(term))}; a term missing
     * from {@code docFrequencyMap} is treated as having document frequency 0.
     */
    private static double tfIdfCalculate(String doc, int totalDocs, String term,
                                         Map<String, Long> docFrequencyMap) {
        double tf = tf(doc, term);
        double idf = idf(totalDocs, docFrequencyMap.getOrDefault(term, 0L));
        return tf * idf;
    }

    /**
     * Term frequency: occurrences of {@code term} divided by the number of words in {@code doc}.
     *
     * @return the frequency in [0, 1]; 0 for a document without words (avoids 0/0 = NaN)
     */
    private static double tf(String doc, String term) {
        String[] tokens = tokenize(doc);
        if (tokens.length == 0) {
            return 0.0;
        }
        long termCount = Arrays.stream(tokens).filter(term::equals).count();
        return (double) termCount / tokens.length;
    }

    /**
     * Inverse document frequency: {@code ln(totalDocs / docFrequency)}. A document frequency
     * of 0 is treated as 1 so the division is always defined.
     */
    private static double idf(int totalDocs, long docFrequency) {
        return Math.log((double) totalDocs / (docFrequency != 0 ? docFrequency : 1));
    }

    /**
     * Writes one {@code docId:term:score} entry per test case, each followed by a single space,
     * all on one line terminated by a line separator. Test cases without a computed score are
     * written with score 0.
     *
     * @param tfidfScores    scores keyed by term, then by document id
     * @param testCases      test cases in the order they must appear in the output
     * @param outputFilePath file to create or overwrite
     * @throws IOException if the file cannot be written
     */
    private static void saveTFIDFScores(Map<String, Map<Integer, Double>> tfidfScores,
                                        List<TestCase> testCases,
                                        String outputFilePath) throws IOException {
        try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(outputFilePath))) {
            for (TestCase testCase : testCases) {
                String term = testCase.term;
                int docId = testCase.documentId;
                double score = tfidfScores.getOrDefault(term, Collections.emptyMap())
                        .getOrDefault(docId, 0.0);
                // The term is passed as an argument, never spliced into the pattern: a term
                // containing '%' would otherwise be parsed as a format specifier.
                // Locale.ROOT pins the decimal separator to '.' on every machine.
                writer.write(String.format(Locale.ROOT, "%d:%s:%.5f", docId, term, score)
                        + WHITESPACE);
            }
            writer.newLine();
        }
    }
}

/** A query pairing a term with the id (list index) of the document it is scored against. */
class TestCase {
    String term;
    Integer documentId;

    TestCase(String term, Integer documentId) {
        this.term = term;
        this.documentId = documentId;
    }
}
Editor is loading...
Leave a Comment