Untitled
unknown
c_cpp
a year ago
7.8 kB
4
Indexable
#define FILE_EXTENSION ".txt" #include<fstream> #include<string> #include<cstring> #include<vector> #include<iostream> #include<bitset> #include <filesystem> #include<stdio.h> #pragma GCC optimize("Ofast") #pragma GCC optimize("-ffast-math") #pragma GCC optimize("unroll-loops") #pragma GCC optimize("-ftree-tail-merge") #define MAX 5000005 #define DATA_MAX 10005 using namespace std; namespace fs = std::filesystem; int num = 0; inline int charToIndex(char& c) { if (c >= 'A' && c <= 'Z') return c - 'A'; else if (c >= 'a' && c <= 'z') return c - 'a'; return -1; } // class Trie { // public: int trie[MAX][26]; bitset<DATA_MAX> ed[MAX], suffix[MAX], prefix[MAX]; bitset<DATA_MAX> NOT_FOUND; // Trie() {} inline void insert(string& word, int &index) { int root = 0; int len = word.size(); for(char &c : word){ int p = charToIndex(c); if(trie[root][p] == 0) trie[root][p]= ++num; root = trie[root][p]; prefix[root][index]=1; } ed[root][index] = 1; root = 0; for(int i = len-1; ~i; --i) { int p = charToIndex(word[i]); if(trie[root][p] == 0) trie[root][p] = ++num; root = trie[root][p]; suffix[root][index] = 1; } } bitset<DATA_MAX> WildCardSearch(string &word, int index, int root) { int len = word.size(); if(len == 3 && word[1] == '*') { bitset<DATA_MAX> result; result.set(); return result; } if (word[index] == '*') { while (index+1 < len && word[index+1] == '*') { ++index; // avoid multiple '*' } if (index == len-2) { return prefix[root]; } bitset<DATA_MAX> result; for (int i = 0; i < 26; ++i) { if (trie[root][i] != 0) { result |= ((WildCardSearch(word, index, trie[root][i]) | WildCardSearch(word, index + 1, root))); } } return result; } else { int charIndex = charToIndex(word[index]); if (trie[root][charIndex] == 0) { return NOT_FOUND; } if(index == len-2) { return ed[trie[root][charIndex]]; } return WildCardSearch(word, index + 1, trie[root][charIndex]); } } bitset<DATA_MAX> SuffixSearch(string& word) { int root = 0; int len = word.size(); for(int i = len-1; ~i; --i) { if(isalpha(word[i])) { int index = charToIndex(word[i]); if(trie[root][index] == 0) return NOT_FOUND; root = trie[root][index]; } } return suffix[root]; } bitset<DATA_MAX> search(string& word, bool pre = false) { int root = 0; for(auto &c: word) { if(isalpha(c)) { int index = charToIndex(c); if(trie[root][index] == 0) return NOT_FOUND; root = trie[root][index]; } } if(! pre) return ed[root]; return prefix[root]; } bitset<DATA_MAX> startsWith(string& pre) { return search(pre, true); } // }; // Utility Func // string parser : output vector of strings (words) after parsing vector<string> word_parse(vector<string> tmp_string){ vector<string> parse_string; for(auto& word : tmp_string){ string new_str; for(auto &ch : word){ if(isalpha(ch)) new_str += ch; } parse_string.emplace_back(new_str); } return parse_string; } vector<string> split(const string& str, const string& delim) { vector<string> res; if("" == str) return res; char * strs = new char[str.length() + 1] ; strcpy(strs, str.c_str()); char * d = new char[delim.length() + 1]; strcpy(d, delim.c_str()); char *p = strtok(strs, d); while(p) { string s = p; res.push_back(s); p = strtok(nullptr, d); } return res; } inline int checkType(string& s) { if(s == "/" || s == "-" || s == "+") // operator return 4; else if(s[0] == '"') { return 0; // exact word } else if(s[0] == '*' && s[s.size()-1] == '*') { return 2; // suffix word } else if(s[0] == '<' && s[s.size()-1] == '>') { return 3; // wildcard search } else { return 1; // prefix } } int main(int argc, char *argv[]) { string data_dir = argv[1] + string("/"); string query = string(argv[2]); ifstream fi(query, ios::in); string output = string(argv[3]); string file, title_name, tmp, q_tmp; FILE *outputFile = freopen(output.c_str(), "w", stdout); vector<string> tmp_string; int cnt = 0; // count of data const int EXACT = 0; const int PREFIX = 1; const int SUFFIX = 2; const int WILDCARD = 3; // GET TITLENAME WORD ARRAY vector<string> title; char str[MAX]; // Trie* tr = new Trie(); while (1) { string current_file_path = data_dir + to_string(cnt) + ".txt"; if(!freopen(current_file_path.c_str(), "r", stdin)) break; fgets_unlocked(str, MAX, stdin); title.emplace_back(str); vector<string> tmp_title = word_parse(split(str, " ")); for(auto &word : tmp_title) { insert(word, cnt); } while (fgets_unlocked(str, MAX, stdin)) { // GET CONTENT WORD VECTOR vector<string> tmp_string = split(str, " "); // PARSE CONTENT vector<string> content = word_parse(tmp_string); for (auto &word : content) { insert(word, cnt); } } ++cnt; //fclose(inputFile); } bitset<DATA_MAX> ans(0); bitset<DATA_MAX> title_tmp(0); // int n = 1; while(getline(fi, q_tmp)) { // get seperate query int merge_type = -1; vector<string> q = split(q_tmp, " "); // cout << n++<< endl; for(auto &it_query : q) { // iterate all the query int type = checkType(it_query); if(type == 4) { if(it_query == "/") { merge_type = 0; } else if(it_query == "-") { merge_type = 1; } else if(it_query == "+") { merge_type = 2; } continue; } switch (type) { case EXACT: title_tmp = search(it_query); break; case PREFIX: title_tmp = startsWith(it_query); break; case SUFFIX: title_tmp = SuffixSearch(it_query); break; case WILDCARD: title_tmp = WildCardSearch(it_query, 1, 0); break; } // if(type == EXACT) { // title_tmp = search(it_query); // } // else if(type == PREFIX) { // title_tmp = startsWith(it_query); // } // else if(type == SUFFIX) { // title_tmp = SuffixSearch(it_query); // } // else if(type == WILDCARD) { // title_tmp = WildCardSearch(it_query, 1, 0); // } switch (merge_type) { case 0: ans |= title_tmp; break; case 1: ans &= ~title_tmp; break; case 2: ans &= title_tmp; break; case -1: ans = title_tmp; break; } // if(merge_type == 0) { // OR // ans |= title_tmp; // } // else if(merge_type == 1) { // NOT // ans &= ~title_tmp; // } // else if(merge_type == 2) { // AND // ans &= title_tmp; // } // else { // -1, don't need to merge // ans = title_tmp; // } title_tmp.reset(); merge_type = -1; } bool found = false; for(int i=ans._Find_first();i< ans.size();i = ans._Find_next(i)) { for(auto &c: title[i]) { putchar_unlocked(c); found = true; } } if(found == false) fputs_unlocked("Not Found!\n", stdout); ans.reset(); } fclose(outputFile); } // 1. UPPERCASE CHARACTER & LOWERCASE CHARACTER ARE SEEN AS SAME. // 2. FOR SPECIAL CHARACTER OR DIGITS IN CONTENT OR TITLE -> PLEASE JUST IGNORE, YOU WONT NEED TO CONSIDER IT. // EG : "AB?AB" WILL BE SEEN AS "ABAB", "I AM SO SURPRISE!" WILL BE SEEN AS WORD ARRAY AS ["I", "AM", "SO", "SURPRISE"]. // 3. THE OPERATOR IN "QUERY.TXT" IS LEFT ASSOCIATIVE // EG : A + B / C == (A + B) / C // //////////////////////////////////////////////////////////
Editor is loading...
Leave a Comment