Untitled
unknown
c_cpp
2 years ago
9.4 kB
8
Indexable
#define FILE_EXTENSION ".txt" #include<fstream> #include<string> #include<cstring> #include<vector> #include<iostream> #include<unordered_map> #include<algorithm> #include<set> #include <filesystem> #include <unordered_set> #include <chrono> using namespace std; namespace fs = std::filesystem; class TrieNode { public: TrieNode* child[26]; TrieNode* parent[26]; // bitset<5000005> prefix(0); // bitset<5000005> suffix(0); bool terminate, terminate_parent; TrieNode() { for (int i = 0; i < 26; i++) { child[i] = nullptr; parent[i] = nullptr; } terminate_parent = false; terminate = false; } }; class Trie { public: TrieNode* root; Trie() { root = new TrieNode(); } void insert(string word) { TrieNode* p = root; TrieNode* back = root; int len = word.size(); for (auto &c : word) { int index = 0; if (c >= 'A' && c <= 'Z') index = c - 'A'; else if (c >= 'a' && c <= 'z') index = c - 'a'; if (p->child[index] == nullptr) p->child[index] = new TrieNode(); p = p->child[index]; } for(int i = len-1; i >= 0; i--) { int index = 0; if (word[i] >= 'A' && word[i] <= 'Z') index = word[i] - 'A'; else if (word[i] >= 'a' && word[i] <= 'z') index = word[i] - 'a'; if (back->parent[index] == nullptr) back->parent[index] = new TrieNode(); back = back->parent[index]; } p->terminate = true; back->terminate_parent = true; } bool WildCardSearch(string word, int index, TrieNode* cur) { int len = word.size(); cout << index << " "; // cout << word[index] << endl; if (index == len-2) { return cur->terminate; } if (word[index] == '*') { while (index+1 < len && word[index+1] == '*') { index++; // avoid multiple '*' } if (index == len-2) { return true; } bool result = false; for (int i = 0; i < 26; i++) { if (cur->child[i] != nullptr) { // cout << word[index+1] << " " << i << endl; // if (index+1 < len && (i == (word[index+1] - 'a') || i == (word[index+1] - 'A')) && WildCardSearch(word, index + 1, cur->child[i])) { // result = true; // } if (WildCardSearch(word, index + 1, cur->child[i])) { result = true; } if (WildCardSearch(word, index, cur->child[i])) { result = true; } } } return result; } else { int charIndex = 0; if (word[index] >= 'A' && word[index] <= 'Z') { charIndex = word[index] - 'A'; } else if (word[index] >= 'a' && word[index] <= 'z') { charIndex = word[index] - 'a'; } if (cur->child[charIndex] == nullptr) { return false; } return WildCardSearch(word, index + 1, cur->child[charIndex]); } } bool SuffixSearch(string word) { TrieNode* p = root; int len = word.size(); for(int i = len-1; i >= 0; i--) { if(isalpha(word[i])) { int index = 0; if(word[i] >= 'A' && word[i] <= 'Z') index = word[i]-'A'; else if(word[i] >= 'a' && word[i] <= 'z') index = word[i]-'a'; if(p->parent[index] == nullptr) return false; p = p->parent[index]; } } return true; } bool search(string word, bool prefix = false) { TrieNode* p = root; for(auto c: word) { if(isalpha(c)) { int index = 0; if(c >= 'A' && c <= 'Z') index = c-'A'; else if(c >= 'a' && c <= 'z') index = c-'a'; if(p->child[index] == nullptr) return false; p = p->child[index]; } } if(prefix == false) return p->terminate; return true; } bool startsWith(string prefix) { return search(prefix, true); } }; // Utility Func // string parser : output vector of strings (words) after parsing vector<string> word_parse(vector<string> tmp_string){ vector<string> parse_string; for(auto& word : tmp_string){ string new_str; for(auto &ch : word){ if(isalpha(ch)) new_str += ch; } parse_string.emplace_back(new_str); } return parse_string; } vector<string> split(const string& str, const string& delim) { vector<string> res; if("" == str) return res; char * strs = new char[str.length() + 1] ; strcpy(strs, str.c_str()); char * d = new char[delim.length() + 1]; strcpy(d, delim.c_str()); char *p = strtok(strs, d); while(p) { string s = p; res.push_back(s); p = strtok(nullptr, d); } return res; } int checkType(string s) { if(s[0] == '"') { return 0; // exact word } else if(s[0] == '*' && s[s.size()-1] == '*') { return 2; // suffix word } else if(s[0] == '<' && s[s.size()-1] == '>') { return 3; // wildcard search } else if(s == "/" || s == "-" || s == "+") // operator return 4; else { return 1; // prefix } } int main(int argc, char *argv[]) { // INPUT : // 1. data directory in data folder // 2. number of txt files // 3. output route auto start_time = std::chrono::high_resolution_clock::now(); string data_dir = argv[1] + string("/"); string query = string(argv[2]); ifstream fi(query, ios::in); string output = string(argv[3]); string file, title_name, tmp, q_tmp; ofstream outputFile(output); vector<string> tmp_string; int cnt = 0; // count of data const int EXACT = 0; const int PREFIX = 1; const int SUFFIX = 2; const int WILDCARD = 3; // GET TITLENAME WORD ARRAY vector<string> title; unordered_map<string, Trie*> mp; /* create trie of every data */ vector<string> file_paths; int i = 0; while (1) { std::string current_file_path = data_dir + std::to_string(i) + ".txt"; if (fs::exists(current_file_path)) { file_paths.emplace_back(current_file_path); // cout << current_file_path << endl; } else { break; } i++; } for (auto file_path : file_paths) { ifstream inputFile(file_path); getline(inputFile, title_name); title.emplace_back(title_name); cnt++; string tmp; while (getline(inputFile, tmp)) { // GET CONTENT WORD VECTOR vector<string> tmp_string = split(tmp, " "); // PARSE CONTENT vector<string> content = word_parse(tmp_string); for (auto word : content) { if (mp[title_name] == nullptr) { mp[title_name] = new Trie(); } mp[title_name]->insert(word); } } inputFile.close(); } set<int> ans_title; unordered_set<int> title_tmp; int n = 1; while(getline(fi, q_tmp)) { // get seperate query int merge_type = -1; vector<string> q = split(q_tmp, " "); outputFile << n++ <<"\n"; for(auto it_query : q) { // iterate all the query string front = "", end = ""; int type = checkType(it_query); if(type == 4) { if(it_query == "/") { merge_type = 0; } else if(it_query == "-") { merge_type = 1; } else if(it_query == "+") { merge_type = 2; } continue; } for(int i = 0; i < cnt; i++) { auto data = mp[title[i]]; if(type == EXACT) { if(data->search(it_query)) title_tmp.insert(i); } else if(type == PREFIX) { if(data->startsWith(it_query)) title_tmp.insert(i); } else if(type == SUFFIX) { if(data->SuffixSearch(it_query)) title_tmp.insert(i); } else if(type == WILDCARD) { if(data->WildCardSearch(it_query, 1, data->root)) title_tmp.insert(i); } } if(merge_type == 0) { // OR for(auto it : title_tmp) { ans_title.insert(it); } } else if(merge_type == 1) { // NOT for (auto it = ans_title.begin(); it != ans_title.end();) { if (title_tmp.count(*it) == 1) it = ans_title.erase(it); else ++it; } } else if(merge_type == 2) { // AND for (auto it = ans_title.begin(); it != ans_title.end();) { if (title_tmp.count(*it) == 0) it = ans_title.erase(it); else ++it; } } else { // -1, don't need to merge for(auto it: title_tmp) { ans_title.insert(it); } } title_tmp.clear(); merge_type = -1; } if(ans_title.empty()) { outputFile << "Not Found!\n"; } else { for(auto it : ans_title) { outputFile << title[it] << "\n"; } } ans_title.clear(); } fi.close(); outputFile.close(); auto end_time = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time); std::cout << "程式執行時間: " << duration.count()/1000 << " ms" << std::endl; } // 1. UPPERCASE CHARACTER & LOWERCASE CHARACTER ARE SEEN AS SAME. // 2. FOR SPECIAL CHARACTER OR DIGITS IN CONTENT OR TITLE -> PLEASE JUST IGNORE, YOU WONT NEED TO CONSIDER IT. // EG : "AB?AB" WILL BE SEEN AS "ABAB", "I AM SO SURPRISE!" WILL BE SEEN AS WORD ARRAY AS ["I", "AM", "SO", "SURPRISE"]. // 3. THE OPERATOR IN "QUERY.TXT" IS LEFT ASSOCIATIVE // EG : A + B / C == (A + B) / C // //////////////////////////////////////////////////////////
Editor is loading...
Leave a Comment