// Untitled
//
// mail@pastecode.io avatar
// unknown
// c_cpp
// 2 years ago
// 6.3 kB
// 2
// Indexable
// Never
#define FILE_EXTENSION ".txt"
#include <algorithm>
#include <cctype>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

using namespace std;

// One node of a 26-way trie.
//   children[i] - subtree reached through the i-th letter (insert/search index
//                 it with `letter - 'a'`), or null when that branch is absent.
//   isEnd       - true when the path from the root to this node is a stored key.
// The default member initializers guarantee that a node never carries
// indeterminate pointers, even when it is created without getNode().
struct TrieNode {
	struct TrieNode *children[26] = {};
	bool isEnd = false;
};

struct TrieNode *getNode(void) {
	struct TrieNode *pNode = new TrieNode;
	pNode->isEnd = false;
	for (int i = 0; i < 26; i++) {
		pNode->children[i] = NULL;
	}
	return pNode;
};

// Entry of the singly linked list of indexed documents.
typedef struct _Node {
	string title;                          // first line of the document file (filled in by main)
	struct TrieNode *content = getNode();  // trie of the document's words; a fresh root is allocated per Node
	struct _Node *next = nullptr;          // following document; null until linked, so a new Node is a valid list tail
} Node;





// Stores `key` in the trie rooted at `root`.
//
// Each node only has 26 child slots, so only ASCII letters can be stored:
// upper-case letters are folded to lower case and every other character is
// skipped. Indexing with `key[i] - 'a'` for anything outside 'a'..'z' would
// read and write outside `children`.
//
// After the walk the node that was reached is flagged as a key end. For a key
// that contains no letters that node is `root` itself, exactly as for an
// empty key. A null `root` is ignored.
void insert (struct TrieNode *root, string key) {
	if (!root) return;

	struct TrieNode *pCrawl = root;
	for (char ch : key) {
		if (ch >= 'A' && ch <= 'Z') ch = static_cast<char>(ch - 'A' + 'a');
		if (ch < 'a' || ch > 'z') continue;  // not a letter: no slot for it

		const int index = ch - 'a';
		if (!pCrawl->children[index]) {
			pCrawl->children[index] = getNode();
		}
		pCrawl = pCrawl->children[index];
	}

	pCrawl->isEnd = true;
}

// Reports whether `key` is stored in the trie rooted at `root`.
//
// The lookup is case-insensitive for ASCII letters and ignores every other
// character, so the index into the 26-slot `children` array is always in
// range (a raw `key[i] - 'a'` is out of bounds for anything but 'a'..'z').
// Returns false for a null `root`; otherwise returns the isEnd flag of the
// node reached after consuming all letters of `key`.
bool search(struct TrieNode *root, string key) {
	struct TrieNode *pCrawl = root;
	if (!pCrawl) return false;

	for (char ch : key) {
		if (ch >= 'A' && ch <= 'Z') ch = static_cast<char>(ch - 'A' + 'a');
		if (ch < 'a' || ch > 'z') continue;  // not a letter: nothing to follow

		pCrawl = pCrawl->children[ch - 'a'];
		if (!pCrawl) return false;
	}

	return pCrawl->isEnd;
}

// Utility Func

// Strips every non-alphabetic character from each word of `tmp_string`.
//
// Returns the set of cleaned words. Letter case is preserved. A word without
// any letters becomes the empty string, and words that clean to the same text
// collapse into one entry.
unordered_set<string> word_parse(unordered_set<string> tmp_string) {
	unordered_set<string> parse_string;
	for (const string &word : tmp_string) {
		string letters;
		letters.reserve(word.size());
		for (const char ch : word) {
			// isalpha() is only defined for values representable as unsigned
			// char (or EOF); a negative plain char must be converted first.
			if (isalpha(static_cast<unsigned char>(ch))) {
				letters.push_back(ch);
			}
		}
		parse_string.insert(letters);
	}
	return parse_string;
}

// Splits `str` into tokens separated by any character contained in `delim`.
//
// Runs of delimiters are treated as one separator, so no empty tokens are
// produced. Returns the set of distinct tokens; an empty `str` (or one made
// only of delimiters) yields an empty set, and an empty `delim` yields the
// whole string as a single token.
//
// Implemented with std::string searches instead of strtok on heap copies:
// nothing is allocated with new[] (so nothing can leak) and no global
// tokenizer state is touched.
unordered_set<string> split(const string& str, const string& delim) {
	unordered_set<string> res;

	string::size_type start = str.find_first_not_of(delim);
	while (start != string::npos) {
		const string::size_type stop = str.find_first_of(delim, start);
		// When stop is npos the count is clamped by substr to "rest of string".
		res.insert(str.substr(start, stop - start));
		start = str.find_first_not_of(delim, stop);
	}

	return res;
}

// string parser : output vector of strings (words) after parsing
// vector<string> query_word_parse(vector<string> tmp_string){
// 	vector<string> parse_string;
// 	for(auto& word : tmp_string){
// 		string new_str;
//     	for(auto &ch : word){
// 			if(isalpha(ch))
// 				new_str.push_back(ch);
// 		}
// 		parse_string.emplace_back(new_str);
// 	}
// 	return parse_string;
// }



// Splits `str` into tokens separated by any character contained in `delim`,
// keeping the tokens in their original order (duplicates included).
//
// Runs of delimiters count as one separator, so no empty tokens are produced.
// An empty `str` yields an empty vector; an empty `delim` yields the whole
// string as a single token.
//
// Implemented with std::string searches instead of strtok on heap copies:
// nothing is allocated with new[] (so nothing can leak) and no global
// tokenizer state is touched.
vector<string> query_split(const string& str, const string& delim) {
	vector<string> res;

	string::size_type start = str.find_first_not_of(delim);
	while (start != string::npos) {
		const string::size_type stop = str.find_first_of(delim, start);
		// When stop is npos the count is clamped by substr to "rest of string".
		res.push_back(str.substr(start, stop - start));
		start = str.find_first_not_of(delim, stop);
	}

	return res;
}


int main(int argc, char *argv[])
{

    // INPUT :
	// 1. data directory in data folder
	// 2. number of txt files
	// 3. output route

	int num = 0;

    string data_dir = argv[1] + string("/");
	string query = string(argv[2]);
	string output = string(argv[3]);

	

	// Read File & Parser Example


	
	fstream fi, query_fi;
	Node *pretxt = NULL;
	pretxt->next = NULL;
	Node *head = pretxt;
	string file, title_name, tmp;
	unordered_set<string> tmp_string;

	while(1) {
		
		string num_str = to_string(num);
		string data_num = data_dir + num_str + FILE_EXTENSION;
		fi.open(data_num, ios::in);

		if (fi.fail()) break;

		Node *newtxt;
		newtxt->next = NULL;
		pretxt->next = newtxt;

		getline(fi, title_name);

		newtxt->title = title_name;
		// newtxt->content = getNode();

    	// GET TITLENAME WORD ARRAY
    	tmp_string = split(title_name, " ");

		for (auto &word : word_parse(tmp_string)) {
			insert(newtxt->content, word);
		}


		// for(auto &word : title){
		// 	cout << word << endl;
		// }

    	// GET CONTENT LINE BY LINE
		while(getline(fi, tmp)){

     	    // GET CONTENT WORD VECTOR
			tmp_string = split(tmp, " ");

			// PARSE CONTENT
			for (auto &word : word_parse(tmp_string)) {
				insert (newtxt->content, word);
			}

			// for(auto &word : content){
			// 	cout << word << endl;
			// }
			
		}
		pretxt = newtxt;

    	// CLOSE FILE
		fi.close();

		num++;
	}
	


	query_fi.open(query, ios::in);

	while(getline(query_fi, tmp)){
		int flag = 0;
		int plusflag = 0;
		vector<string> query_word;
		vector<Node *> ans;

        // GET CONTENT WORD VECTOR
		query_word = query_split(tmp, " ");

		// PARSE CONTENT

		for(auto &word : query_word){
			if (word == "/") {
				continue;
			}

			if (word == "+") {
				plusflag = 1;
				continue;
			}

			int found;
			Node *tmpNode = head->next;

			if (!ans.empty() && plusflag) {
				if (*(word.begin()) == '\"') {
					word.erase(word.begin());
					word.pop_back();
				}

				int count = 0;

				for (auto &node : ans) {
					if (!search(node->content, word)) {
						ans.erase(ans.begin()+count);
					}
					count++;
				}

				if (ans.empty()) flag = 0;
				plusflag = 0;

				
			}
			else if (ans.empty() && plusflag) {
				plusflag = 0;
				continue;
			}
			else {
				if (*(word.begin()) == '\"') {
					word.erase(word.begin());
					word.pop_back();
				}

				while(tmpNode) {
					if (search(tmpNode->content, word)) {
						ans.push_back(tmpNode);
						flag = 1;
					}
				} 
			}

		}
		if (ans.empty()) {
			cout << "Not Found!\n";
		}
		else {
			for (auto &node : ans) {
				cout << node->title << endl;
			}
		}
		//......
	}

	query_fi.close();


	// from data_dir get file ....
	// eg : use 0.txt in data directory

	//fi.open("data/0.txt", ios::in);

    // GET TITLENAME

	return 0;
	
}


// 1. UPPERCASE AND LOWERCASE CHARACTERS ARE TREATED AS THE SAME.
// 2. SPECIAL CHARACTERS AND DIGITS IN THE CONTENT OR TITLE ARE SIMPLY IGNORED; YOU WON'T NEED TO CONSIDER THEM.
//    E.G. "AB?AB" IS SEEN AS "ABAB", AND "I AM SO SURPRISE!" IS SEEN AS THE WORD ARRAY ["I", "AM", "SO", "SURPRISE"].
// 3. THE OPERATORS IN "QUERY.TXT" ARE LEFT-ASSOCIATIVE.
//    E.G. A + B / C == (A + B) / C

//

//////////////////////////////////////////////////////////