import re


class WordIterator:
    """ A basic wrapper for sequential reading of words from a file """

    def __init__(self, filename):
        self.filename = filename
        self.word_queue = []
        self.line_payload = 50  # how many lines will be loaded into the queue at one time
        with open(filename) as file:
            self.__lines_count = sum(1 for _ in file)
        self.__load_line = 1  # index of the first line that hasn't been loaded into the queue yet
        self.__separators = []

    def read(self):
        """ Returns the first word from the file that wasn't already read by this particular object.
        Return value format: (word, separator)
        When all words have been read, it returns None """
        if len(self.word_queue) == 0:
            self.__load_payload()
        if len(self.word_queue) == 0:
            return None
        return self.word_queue.pop(0)

    def push_back(self, word):
        """ Pushes the word from the argument into the first position of the queue.
        Useful in case we already read a word but want to re-read it again """
        self.word_queue.insert(0, word)

    def add_separator(self, sep):
        """ All words will additionally be split on the sep separator """
        if sep in self.__separators:
            return
        self.__separators.append(sep)
        # Re-split the words already sitting in the queue around the new separator.
        new_queue = []
        regex_arg = re.compile('(' + re.escape(sep) + ')')
        for word, word_sep in self.word_queue:
            new_word = re.sub(regex_arg, r' \1 ', word)
            new_words = new_word.split()  # split() drops the empty strings a split(' ') would leave behind
            new_queue += self.__make_pairs(word + word_sep, new_words)
        self.word_queue = new_queue

    def __load_payload(self):
        """ Loads words from the input file into the queue.
        Format of the queued items: (word, separator) """
        index = 0
        with open(self.filename, 'r') as file:
            for line in file:
                index += 1
                if index < self.__load_line:
                    continue
                elif index < self.__load_line + self.line_payload:
                    self.word_queue += self.__parse_words(line)
                    if index >= self.__lines_count:
                        index += 1  # the last line was loaded; point past the end of the file
                else:
                    break
        # Never move the load pointer backwards, otherwise an exhausted iterator
        # would re-load the final line on a later call.
        self.__load_line = max(self.__load_line, index)

    def __parse_words(self, line):
        """ Parses a line passed by argument using regular expressions.
        It separates each word on the line and stores it into a list.
        Returns a list of tuples (word, separator between the word and the next word) """
        orig_line = line
        if line == "\n" or line == "\r\n":
            return self.__make_pairs(orig_line, [line])
        # All non-escaped occurrences of the separator characters are split off
        # to be a single 'word'. (The exact regex was lost in the source; splitting
        # on the separators registered via add_separator is an assumption.)
        for sep in self.__separators:
            line = re.sub(r'(?<!\\)(' + re.escape(sep) + ')', r' \1 ', line)
        words = line.split()
        return self.__make_pairs(orig_line, words)

    def __make_pairs(self, orig_line, words):
        """ Pairs every word with the separator text that follows it in orig_line """
        pairs = []
        read_index = 0
        for i in range(len(words)):
            if i + 1 >= len(words):
                start = orig_line[read_index:].find(words[i]) + len(words[i]) + read_index
                pairs.append((words[i], orig_line[start:]))
            else:
                start = orig_line[read_index:].find(words[i]) + len(words[i]) + read_index
                end = orig_line[start:].find(words[i + 1]) + start
                pairs.append((words[i], orig_line[start:end]))
                read_index = end
        return pairs
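

# Minimal usage sketch, assuming a plain-text file named "example.txt" exists
# next to this module (the filename and the comma separator are illustrative
# assumptions, not part of the original source). It drains the iterator and
# prints every (word, separator) pair it yields.
if __name__ == "__main__":
    words = WordIterator("example.txt")
    words.add_separator(",")  # commas are treated as standalone words from now on
    pair = words.read()
    while pair is not None:
        word, following_sep = pair
        print(repr(word), "->", repr(following_sep))
        pair = words.read()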