..
unknown
plain_text
a year ago
5.7 kB
5
Indexable
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Command-line tool that computes TF-IDF scores for (term, document) pairs.
 *
 * <p>The documents file is cut into documents of {@value #LINES_PER_DOCUMENT} consecutive
 * lines each; the test-case file holds the query terms on its first line and the matching
 * document indices on its second line. One {@code docId:term:score} entry per test case is
 * written to {@code output.txt}.
 */
public class TFIDFCalculator {
    public static final String WHITESPACE = " ";
    public static final String LETTERS_ONLY = "[^a-zA-Z]";
    public static final String EXTRA_SPACES = "\\s+";

    /** Compiled once; the original recompiled both regexes for every input line. */
    private static final Pattern NON_LETTERS = Pattern.compile(LETTERS_ONLY);
    private static final Pattern WHITESPACE_RUN = Pattern.compile(EXTRA_SPACES);

    /** Number of consecutive input lines that make up one document. */
    private static final int LINES_PER_DOCUMENT = 5;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the documents file, {@code args[1]} the test-case file;
     *             any other argument count prints a usage message and returns
     * @throws IOException if an input file cannot be read or {@code output.txt} cannot be written
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.out.println("Usage: java TFIDFCalculator docs.txt tc<No.>.txt");
            return;
        }
        String docsFilePath = args[0];
        String tcFilePath = args[1];
        String outputFilePath = "output.txt";

        List<String> documents = splitDocuments(docsFilePath);
        List<TestCase> testCases = readTestCases(tcFilePath);
        Map<String, Map<Integer, Double>> tfidfScores = calculateTFIDFScores(documents, testCases);
        saveTFIDFScores(tfidfScores, testCases, outputFilePath);
    }

    /**
     * Reads the documents file and groups every {@value #LINES_PER_DOCUMENT} lines into one
     * normalized document. A trailing group with fewer lines still becomes a document, and a
     * group made only of blank lines becomes an empty document so later indices do not shift.
     *
     * @param docsFilePath path of the documents file
     * @return the normalized documents in file order
     * @throws IOException if the file cannot be read
     */
    private static List<String> splitDocuments(String docsFilePath) throws IOException {
        List<String> documents = new ArrayList<>();
        StringBuilder currentDocument = new StringBuilder();
        try (BufferedReader reader = Files.newBufferedReader(Paths.get(docsFilePath))) {
            String line;
            int lineCounter = 0;
            while ((line = reader.readLine()) != null) {
                // The separator keeps the last word of one line apart from the first of the next.
                currentDocument.append(line).append(WHITESPACE);
                lineCounter++;
                if (lineCounter % LINES_PER_DOCUMENT == 0) {
                    documents.add(normalize(currentDocument.toString()));
                    currentDocument.setLength(0);
                }
            }
            // Non-empty exactly when lines were read since the last complete group.
            if (currentDocument.length() > 0) {
                documents.add(normalize(currentDocument.toString()));
            }
        }
        return documents;
    }

    /**
     * Replaces every non-ASCII-letter character with a space, collapses whitespace runs to a
     * single space, lower-cases and trims.
     */
    private static String normalize(String text) {
        String lettersOnly = NON_LETTERS.matcher(text).replaceAll(WHITESPACE);
        String singleSpaced = WHITESPACE_RUN.matcher(lettersOnly).replaceAll(WHITESPACE);
        // Locale.ROOT: under a Turkish default locale "I" would lower-case to a dotless i.
        return singleSpaced.toLowerCase(Locale.ROOT).trim();
    }

    /**
     * Splits text on runs of whitespace and never yields empty tokens; blank text yields an
     * empty list.
     */
    private static List<String> tokenize(String text) {
        String trimmed = text.trim();
        if (trimmed.isEmpty()) {
            return Collections.emptyList();
        }
        return Arrays.asList(WHITESPACE_RUN.split(trimmed));
    }

    /**
     * Reads the test-case file: line 1 holds the terms, line 2 the document indices, both
     * whitespace-separated. Term {@code i} is paired with index {@code i}; surplus entries on
     * the longer line are ignored.
     *
     * @param tcFilePath path of the test-case file
     * @return the (term, document index) pairs in file order
     * @throws IOException if the file cannot be read
     * @throws IllegalArgumentException if the file has fewer than two lines
     * @throws NumberFormatException if a document index is not an integer
     */
    private static List<TestCase> readTestCases(String tcFilePath) throws IOException {
        List<String> lines = Files.readAllLines(Paths.get(tcFilePath));
        if (lines.size() < 2) {
            throw new IllegalArgumentException("tc_file must contain at least two lines");
        }
        // Tokenizing on whitespace runs keeps repeated separators from producing "" entries,
        // which would otherwise make Integer.parseInt fail.
        List<String> terms = tokenize(lines.get(0));
        List<String> docIds = tokenize(lines.get(1));
        int pairCount = Math.min(terms.size(), docIds.size());
        List<TestCase> testCases = new ArrayList<>(pairCount);
        for (int i = 0; i < pairCount; i++) {
            testCases.add(new TestCase(terms.get(i), Integer.parseInt(docIds.get(i))));
        }
        return testCases;
    }

    /**
     * Computes the TF-IDF score of every test case whose document index is in range.
     *
     * @param documents normalized documents, indexed by document id
     * @param testCases the (term, document index) pairs to score
     * @return scores keyed by term, then by document index; test cases with an index outside
     *         {@code [0, documents.size())} have no entry
     */
    private static Map<String, Map<Integer, Double>> calculateTFIDFScores(List<String> documents, List<TestCase> testCases) {
        int totalDocs = documents.size();
        List<Map<String, Long>> termCountsPerDoc = new ArrayList<>(totalDocs);
        long[] tokenTotals = new long[totalDocs];
        Map<String, Long> docFrequencyMap = new HashMap<>();

        // Single pass: per-document term counts, token totals and document frequencies, so no
        // document has to be re-split for each test case.
        for (int docId = 0; docId < totalDocs; docId++) {
            List<String> tokens = tokenize(documents.get(docId));
            Map<String, Long> counts = new HashMap<>();
            for (String token : tokens) {
                counts.merge(token, 1L, Long::sum);
            }
            termCountsPerDoc.add(counts);
            tokenTotals[docId] = tokens.size();
            for (String term : counts.keySet()) {
                docFrequencyMap.merge(term, 1L, Long::sum);
            }
        }

        Map<String, Map<Integer, Double>> tfidfScores = new HashMap<>();
        for (TestCase testCase : testCases) {
            int docId = testCase.documentId;
            // Negative indices are skipped like too-large ones instead of throwing.
            if (docId < 0 || docId >= totalDocs) {
                continue;
            }
            String term = testCase.term;
            long termCount = termCountsPerDoc.get(docId).getOrDefault(term, 0L);
            double tfidf = tf(termCount, tokenTotals[docId])
                    * idf(totalDocs, docFrequencyMap.getOrDefault(term, 0L));
            tfidfScores.computeIfAbsent(term, k -> new HashMap<>()).put(docId, tfidf);
        }
        return tfidfScores;
    }

    /**
     * Term frequency: occurrences of the term divided by the document's token count.
     *
     * @return the ratio, or {@code 0.0} for a document without tokens
     */
    private static double tf(long termCount, long totalCount) {
        if (totalCount == 0) {
            return 0.0;
        }
        return (double) termCount / totalCount;
    }

    /**
     * Inverse document frequency: natural log of {@code totalDocs / docFrequency}. A document
     * frequency of zero is treated as one to avoid dividing by zero.
     */
    private static double idf(int totalDocs, long docFrequency) {
        return Math.log((double) totalDocs / (docFrequency != 0 ? docFrequency : 1));
    }

    /**
     * Writes one {@code docId:term:score} entry per test case, in test-case order, each followed
     * by a space, all on a single line terminated by a line separator. Scores are printed with
     * five decimals; test cases without a computed score are written as zero.
     *
     * @param tfidfScores scores keyed by term, then by document index
     * @param testCases the test cases that define the output order
     * @param outputFilePath path of the file to (over)write
     * @throws IOException if the file cannot be written
     */
    private static void saveTFIDFScores(Map<String, Map<Integer, Double>> tfidfScores, List<TestCase> testCases, String outputFilePath) throws IOException {
        try (BufferedWriter writer = Files.newBufferedWriter(Paths.get(outputFilePath))) {
            for (TestCase testCase : testCases) {
                String term = testCase.term;
                int docId = testCase.documentId;
                double score = tfidfScores.getOrDefault(term, Collections.emptyMap()).getOrDefault(docId, 0.0);
                // The term is passed as an argument, never spliced into the pattern, so a '%'
                // inside it cannot be read as a format specifier. Locale.ROOT pins '.' as the
                // decimal separator regardless of the machine's default locale.
                writer.write(String.format(Locale.ROOT, "%d:%s:%.5f", docId, term, score));
                writer.write(WHITESPACE);
            }
            writer.newLine();
        }
    }
}
/**
 * Immutable pairing of a query term with the document it should be scored against.
 * {@code TFIDFCalculator} uses {@link #documentId} as a zero-based index into its document list.
 */
class TestCase {
    /** Term whose score is requested; never {@code null}. */
    final String term;
    /** Index of the document to score the term against; never {@code null}. */
    final Integer documentId;

    /**
     * @param term the query term
     * @param documentId the document index
     * @throws NullPointerException if either argument is {@code null}
     */
    TestCase(String term, Integer documentId) {
        // Fail at construction rather than later, when the id is unboxed or the term is used
        // as a map key.
        this.term = Objects.requireNonNull(term, "term");
        this.documentId = Objects.requireNonNull(documentId, "documentId");
    }
}
Leave a Comment