Untitled
unknown
c_cpp
2 years ago
6.3 kB
4
Indexable
#define FILE_EXTENSION ".txt" #include<fstream> #include<string> #include<cstring> #include<vector> #include<iostream> #include<unordered_set> using namespace std; struct TrieNode { struct TrieNode *children[26]; bool isEnd; }; struct TrieNode *getNode(void) { struct TrieNode *pNode = new TrieNode; pNode->isEnd = false; for (int i = 0; i < 26; i++) { pNode->children[i] = NULL; } return pNode; }; typedef struct _Node { string title; struct TrieNode *content = getNode(); struct _Node *next; }Node; void insert (struct TrieNode *root, string key) { struct TrieNode *pCrawl = root; for (int i = 0; i < key.length(); i++) { int index = key[i] - 'a'; if (!pCrawl->children[index]) { pCrawl->children[index] = getNode(); } pCrawl = pCrawl->children[index]; } pCrawl->isEnd = true; } bool search(struct TrieNode *root, string key) { struct TrieNode *pCrawl = root; for (int i = 0; i < key.length(); i++) { int index = key[i] - 'a'; if (!pCrawl->children[index]) return false; pCrawl = pCrawl->children[index]; } return (pCrawl->isEnd); } // Utility Func unordered_set<string> word_parse(unordered_set<string> tmp_string) { unordered_set<string> parse_string; for (auto& word : tmp_string) { string new_str; for (auto &ch : word) { if (isalpha(ch)) new_str.push_back(ch); } parse_string.insert(new_str); } return parse_string; } unordered_set<string> split(const string& str, const string& delim) { unordered_set<string> res; if ("" == str) return res; char * strs = new char[str.length() + 1]; strcpy(strs, str.c_str()); char * d = new char[delim.length() + 1]; strcpy(d, delim.c_str()); char *p = strtok(strs, d); while(p) { string s = p; res.insert(s); p = strtok(NULL, d); } return res; } // string parser : output vector of strings (words) after parsing // vector<string> query_word_parse(vector<string> tmp_string){ // vector<string> parse_string; // for(auto& word : tmp_string){ // string new_str; // for(auto &ch : word){ // if(isalpha(ch)) // new_str.push_back(ch); // } // parse_string.emplace_back(new_str); // } // return parse_string; // } vector<string> query_split(const string& str, const string& delim) { vector<string> res; if("" == str) return res; //先將要切割的字串從string型別轉換為char*型別 char * strs = new char[str.length() + 1] ; //不要忘了 strcpy(strs, str.c_str()); char * d = new char[delim.length() + 1]; strcpy(d, delim.c_str()); char *p = strtok(strs, d); while(p) { string s = p; //分割得到的字串轉換為string型別 res.push_back(s); //存入結果陣列 p = strtok(NULL, d); } return res; } int main(int argc, char *argv[]) { // INPUT : // 1. data directory in data folder // 2. number of txt files // 3. output route int num = 0; string data_dir = argv[1] + string("/"); string query = string(argv[2]); string output = string(argv[3]); // Read File & Parser Example fstream fi, query_fi; Node *pretxt = NULL; pretxt->next = NULL; Node *head = pretxt; string file, title_name, tmp; unordered_set<string> tmp_string; while(1) { char *intStr; itoa(num, intStr, 10); string num_str = string(intStr); string data_num = data_dir + num_str + FILE_EXTENSION; fi.open(data_num, ios::in); if (fi.fail()) break; Node *newtxt; newtxt->next = NULL; pretxt->next = newtxt; getline(fi, title_name); newtxt->title = title_name; // newtxt->content = getNode(); // GET TITLENAME WORD ARRAY tmp_string = split(title_name, " "); for (auto &word : word_parse(tmp_string)) { insert(newtxt->content, word); } // for(auto &word : title){ // cout << word << endl; // } // GET CONTENT LINE BY LINE while(getline(fi, tmp)){ // GET CONTENT WORD VECTOR tmp_string = split(tmp, " "); // PARSE CONTENT for (auto &word : word_parse(tmp_string)) { insert (newtxt->content, word); } // for(auto &word : content){ // cout << word << endl; // } } pretxt = newtxt; // CLOSE FILE fi.close(); } query_fi.open(query, ios::in); while(getline(query_fi, tmp)){ int flag = 0; int plusflag = 0; vector<string> query_word; vector<Node *> ans; // GET CONTENT WORD VECTOR query_word = query_split(tmp, " "); // PARSE CONTENT for(auto &word : query_word){ if (word == "/") { continue; } if (word == "+") { plusflag = 1; continue; } int found; Node *tmpNode = head->next; if (!ans.empty() && plusflag) { if (*(word.begin()) == '\"') { word.erase(word.begin()); word.pop_back(); } int count = 0; for (auto &node : ans) { if (!search(node->content, word)) { ans.erase(ans.begin()+count); } count++; } if (ans.empty()) flag = 0; plusflag = 0; } else if (ans.empty() && plusflag) { plusflag = 0; continue; } else { if (*(word.begin()) == '\"') { word.erase(word.begin()); word.pop_back(); } while(tmpNode) { if (search(tmpNode->content, word)) { ans.push_back(tmpNode); flag = 1; } } } } if (ans.empty()) { cout << "Not Found!\n"; } else { for (auto &node : ans) { cout << node->title << endl; } } //...... } query_fi.close(); // from data_dir get file .... // eg : use 0.txt in data directory //fi.open("data/0.txt", ios::in); // GET TITLENAME return 0; } // 1. UPPERCASE CHARACTER & LOWERCASE CHARACTER ARE SEEN AS SAME. // 2. FOR SPECIAL CHARACTER OR DIGITS IN CONTENT OR TITLE -> PLEASE JUST IGNORE, YOU WONT NEED TO CONSIDER IT. // EG : "AB?AB" WILL BE SEEN AS "ABAB", "I AM SO SURPRISE!" WILL BE SEEN AS WORD ARRAY AS ["I", "AM", "SO", "SURPRISE"]. // 3. THE OPERATOR IN "QUERY.TXT" IS LEFT ASSOCIATIVE // EG : A + B / C == (A + B) / C // //////////////////////////////////////////////////////////
Editor is loading...