Untitled
unknown
c_cpp
3 years ago
6.3 kB
9
Indexable
#define FILE_EXTENSION ".txt"
#include<fstream>
#include<string>
#include<cstring>
#include<vector>
#include<iostream>
#include<unordered_set>
using namespace std;
// One node of a lowercase-letter trie.
// children[i] is the edge for the letter 'a' + i; a null entry means no stored
// word continues with that letter. isEnd is true when the path from the root
// to this node spells a complete stored word.
// Both members carry default initializers so that a freshly created node is
// always an empty, non-terminal node, even when it is not built via getNode().
struct TrieNode {
    struct TrieNode *children[26] = {};
    bool isEnd = false;
};
// Allocates a new trie node with every child slot cleared and the
// end-of-word flag unset, and returns a raw pointer to it.
struct TrieNode *getNode(void) {
    struct TrieNode *fresh = new TrieNode;
    for (auto &slot : fresh->children) {
        slot = nullptr;
    }
    fresh->isEnd = false;
    return fresh;
}
// One entry of the singly linked list of loaded documents.
// The pointers default to null so that a node created with `new Node` never
// carries indeterminate values (the list's dummy head never gets a trie).
typedef struct _Node {
    std::string title;                   // title line of the document
    struct TrieNode *content = nullptr;  // trie of the document's words
    struct _Node *next = nullptr;        // next document, or null at the tail
} Node;
// Stores `key` in the trie rooted at `root`.
//
// ASCII upper-case letters are folded to lower case and every character that
// is not a letter is skipped, so the computed child index always stays inside
// the 26-slot `children` array (the previous `key[i] - 'a'` indexed out of
// bounds for anything other than 'a'..'z').
// A key without any letters marks the root itself as a word end, which is what
// inserting the empty string has always done.
// A null `root` is ignored.
void insert(struct TrieNode *root, std::string key) {
    if (!root) {
        return;
    }
    struct TrieNode *cursor = root;
    for (char ch : key) {
        if (ch >= 'A' && ch <= 'Z') {
            ch = static_cast<char>(ch - 'A' + 'a');
        }
        if (ch < 'a' || ch > 'z') {
            continue;  // digits, punctuation, non-ASCII bytes: not storable
        }
        const int slot = ch - 'a';
        if (!cursor->children[slot]) {
            cursor->children[slot] = getNode();
        }
        cursor = cursor->children[slot];
    }
    cursor->isEnd = true;
}
bool search(struct TrieNode *root, string key) {
struct TrieNode *pCrawl = root;
for (int i = 0; i < key.length(); i++) {
int index = key[i] - 'a';
if (!pCrawl->children[index]) return false;
pCrawl = pCrawl->children[index];
}
return (pCrawl->isEnd);
}
// Utility Func
// Normalizes raw tokens into index words.
//
// For every token in `tmp_string`, keeps only the ASCII letters and folds them
// to lower case (upper and lower case are treated as the same; digits and
// special characters are ignored). Tokens that contain no letters at all
// produce no word. The range checks are done on plain ASCII on purpose: the
// trie only has slots for 'a'..'z', and passing a negative `char` to isalpha()
// is undefined behaviour.
//
// Returns the set of distinct normalized words.
std::unordered_set<std::string> word_parse(std::unordered_set<std::string> tmp_string) {
    std::unordered_set<std::string> parse_string;
    for (const std::string &word : tmp_string) {
        std::string cleaned;
        cleaned.reserve(word.size());
        for (const char ch : word) {
            if (ch >= 'A' && ch <= 'Z') {
                cleaned.push_back(static_cast<char>(ch - 'A' + 'a'));
            } else if (ch >= 'a' && ch <= 'z') {
                cleaned.push_back(ch);
            }
        }
        if (!cleaned.empty()) {
            parse_string.insert(cleaned);
        }
    }
    return parse_string;
}
// Splits `str` into the set of distinct tokens separated by any character of
// `delim`. Runs of delimiters are collapsed, so no empty token is produced;
// an empty `str` (or one made only of delimiters) yields an empty set, and an
// empty `delim` yields the whole string as a single token.
//
// Works directly on the std::string instead of copying into heap buffers for
// strtok(): the previous buffers were never released, and strtok() keeps
// hidden global state.
std::unordered_set<std::string> split(const std::string& str, const std::string& delim) {
    std::unordered_set<std::string> res;
    std::string::size_type start = str.find_first_not_of(delim);
    while (start != std::string::npos) {
        // `stop` is npos for the last token; substr() then runs to the end.
        const std::string::size_type stop = str.find_first_of(delim, start);
        res.insert(str.substr(start, stop - start));
        start = str.find_first_not_of(delim, stop);
    }
    return res;
}
// string parser : output vector of strings (words) after parsing
// vector<string> query_word_parse(vector<string> tmp_string){
// vector<string> parse_string;
// for(auto& word : tmp_string){
// string new_str;
// for(auto &ch : word){
// if(isalpha(ch))
// new_str.push_back(ch);
// }
// parse_string.emplace_back(new_str);
// }
// return parse_string;
// }
// Splits `str` into tokens separated by any character of `delim`, keeping the
// tokens in their original order (duplicates included). Runs of delimiters are
// collapsed, so no empty token is produced; an empty `str` yields an empty
// vector, and an empty `delim` yields the whole string as a single token.
//
// Works directly on the std::string instead of copying into heap buffers for
// strtok(): the previous buffers were never released, and strtok() keeps
// hidden global state.
std::vector<std::string> query_split(const std::string& str, const std::string& delim) {
    std::vector<std::string> res;
    std::string::size_type start = str.find_first_not_of(delim);
    while (start != std::string::npos) {
        // `stop` is npos for the last token; substr() then runs to the end.
        const std::string::size_type stop = str.find_first_of(delim, start);
        res.push_back(str.substr(start, stop - start));
        start = str.find_first_not_of(delim, stop);
    }
    return res;
}
int main(int argc, char *argv[])
{
// INPUT :
// 1. data directory in data folder
// 2. number of txt files
// 3. output route
int num = 0;
string data_dir = argv[1] + string("/");
string query = string(argv[2]);
string output = string(argv[3]);
// Read File & Parser Example
fstream fi;
Node *pretxt = new Node;
pretxt->next = NULL;
Node *head = pretxt;
string file, title_name, tmp;
unordered_set<string> tmp_string;
while(1) {
string num_str = to_string(num);
string data_num = data_dir + num_str + FILE_EXTENSION;
fi.open(data_num, ios::in);
if (fi.fail()) break;
Node *newtxt = new Node;
newtxt->next = NULL;
pretxt->next = newtxt;
newtxt->content = getNode();
getline(fi, title_name);
newtxt->title = title_name;
// newtxt->content = getNode();
// GET TITLENAME WORD ARRAY
tmp_string = split(title_name, " ");
for (auto &word : word_parse(tmp_string)) {
insert(newtxt->content, word);
}
// for(auto &word : title){
// cout << word << endl;
// }
// GET CONTENT LINE BY LINE
while(getline(fi, tmp)){
// GET CONTENT WORD VECTOR
tmp_string = split(tmp, " ");
// PARSE CONTENT
for (auto &word : word_parse(tmp_string)) {
insert (newtxt->content, word);
}
// for(auto &word : content){
// cout << word << endl;
// }
}
pretxt = newtxt;
// CLOSE FILE
fi.close();
num++;
}
fi.open(query, ios::in);
while(getline(fi, tmp)){
int flag = 0;
int plusflag = 0;
vector<string> query_word;
vector<Node *> ans;
// GET CONTENT WORD VECTOR
query_word = query_split(tmp, " ");
// PARSE CONTENT
for(auto &word : query_word){
if (word == "/") {
continue;
}
if (word == "+") {
plusflag = 1;
continue;
}
int found;
Node *tmpNode = head->next;
if (!ans.empty() && plusflag) {
if (*(word.begin()) == '\"') {
word.erase(word.begin());
word.pop_back();
}
int count = 0;
for (auto &node : ans) {
if (!search(node->content, word)) {
ans.erase(ans.begin()+count);
}
count++;
}
if (ans.empty()) flag = 0;
plusflag = 0;
}
else if (ans.empty() && plusflag) {
plusflag = 0;
continue;
}
else {
if (*(word.begin()) == '\"') {
word.erase(word.begin());
word.pop_back();
}
while(tmpNode) {
if (search(tmpNode->content, word)) {
ans.push_back(tmpNode);
flag = 1;
}
}
}
}
if (ans.empty()) {
cout << "Not Found!\n";
}
else {
for (auto &node : ans) {
cout << node->title << endl;
}
}
//......
}
fi.close();
// from data_dir get file ....
// eg : use 0.txt in data directory
//fi.open("data/0.txt", ios::in);
// GET TITLENAME
return 0;
}
// 1. UPPERCASE AND LOWERCASE CHARACTERS ARE TREATED AS THE SAME.
// 2. SPECIAL CHARACTERS AND DIGITS IN THE CONTENT OR TITLE SHOULD SIMPLY BE IGNORED; YOU DON'T NEED TO HANDLE THEM.
// EG : "AB?AB" IS TREATED AS "ABAB", AND "I AM SO SURPRISE!" BECOMES THE WORD ARRAY ["I", "AM", "SO", "SURPRISE"].
// 3. THE OPERATORS IN "QUERY.TXT" ARE LEFT-ASSOCIATIVE.
// EG : A + B / C == (A + B) / C
//
//////////////////////////////////////////////////////////
Editor is loading...