# Author: Oliver Steele, steele@cs.brandeis.edu
# Source: http://osteele.com/sources/textlines.py
#
# Copyright 1998-1999 by Oliver Steele.
# You have my permission to use this freely, as long as you keep the attribution
# and label your changes. -- Oliver Steele

"""Module textlines -- a read-on-demand substitute for file.readlines()

OVERVIEW

Use textlines(file) instead of file.readlines() when it's not possible
or desirable to read the entire file into memory at once.  For example:

    for line in textlines(file):
        ...

is computationally equivalent to

    for line in file.readlines():
        ...

but only reads one line at a time into memory.

The argument to textlines can also be a pathname -- that is,
textlines(pathname) is equivalent to textlines(open(pathname)).

DETAILS

textlines() returns an object of type TextFileLineIterator, which
supports the len() and [index] operations and can therefore be used in
for loops and as a sequence argument to map and filter.

Unlike readlines(), textlines() doesn't read the entire file into memory
at once -- it reads each line as it's requested (reading it multiple
times if it's requested multiple times).

If lines is an object returned by a call to textlines, lines[n] for an
arbitrary value is generally very inefficient (the file is scanned from
the beginning, and previous computation isn't cached).  However, the
special case where the previous operation on lines was an evaluation of
lines[n-1] is cached.  This makes the idioms

    for line in textlines(file):
        ...

and

    map(fn, textlines(file))
    filter(fn, textlines(file))

roughly as efficient in time as the corresponding code that uses
open(file).readlines() instead of textlines(file).

The result of a call to len(lines) is also cached.

A negative index counts from the end of the file (lines[-1] is the last
line).  Resolving it calls len(lines), which scans the whole file the
first time it is needed.

The implementation class, TextFileLineIterator, is exposed so that it
can be subclassed to implement additional caching schemes (for example,
lineno -> string mappings could be stored in a table).
"""

__author__ = "Oliver Steele", 'steele@cs.brandeis.edu'
# NOTE(review): the change history below goes up to 1.1d1 but this still says
# 1.0d1 -- confirm which one is intended before bumping it.
__version__ = '1.0d1'

# Change history:
# 1.0 2/22/99
#   Initial version.
# 1.1d1 2/22/99
#   Return object now emulates a file more fully:
#   - f.closed, f.mode, f.name, and f.softspace are defined (and call the
#     basis object)
#   - f.close() is defined (ditto)
# Later changes (not by the original author):
#   - raise statements use the call form and the pathname test uses
#     isinstance(..., str), so the module also loads and runs on Python 3
#   - lines[-n] counts from the end of the file instead of failing with an
#     unbound-local error

import string


def textlines(path_or_file):
    """Return an object that supports a subset of the sequence protocol
    (len(lines) and lines[index]), and that can be used as a lazy (less
    memory-hungry) replacement for file.readlines() or
    open(path).readlines().

    path_or_file is either a pathname string, which is opened for
    reading, or an already-open file object supporting seek(0) and
    readline().
    """
    return TextFileLineIterator(path_or_file)


class TextFileLineIterator:
    """Lazy, sequence-like view of the lines of a text file.

    Instance attributes:
      file       -- the underlying file object
      nextindex  -- index of the line the next readline() call will return
      length     -- number of lines; only set once len() has been called
    """

    # Attributes that are looked up on the underlying file object.
    _DELEGATED = ('closed', 'mode', 'name', 'softspace')

    def __init__(self, path_or_file):
        """Wrap path_or_file, opening it first if it is a pathname string."""
        source = path_or_file
        if isinstance(source, str):
            source = open(source)
        self.file = source
        self.rewind()

    def __getattr__(self, name):
        """Forward the file-like attributes to the underlying file.

        Only called when normal attribute lookup fails.  Any name outside
        the forwarded set raises AttributeError, which is also what makes
        hasattr(self, 'length') false until the length has been cached.
        """
        if name in self._DELEGATED:
            return getattr(self.file, name)
        raise AttributeError(name)

    def close(self):
        """Close the underlying file."""
        self.file.close()

    def rewind(self):
        """Seek to the start of the file and reset the line counter."""
        self.file.seek(0)
        self.nextindex = 0

    def __len__(self):
        """Return the number of lines in the file.

        The first call scans the whole file (leaving it positioned at the
        end) and stores the result in self.length; later calls return the
        stored value.
        """
        if not hasattr(self, 'length'):
            self.rewind()
            length = 0
            # Iterate through self rather than the raw file so that
            # nextindex stays in step with the file position.
            for line in self:
                length = length + 1
            self.length = length
        return self.length

    def __getitem__(self, index):
        """Return line number index (0-based), including its line ending.

        A negative index counts back from the end of the file.  Reading
        index n directly after index n-1 costs a single readline(); any
        other access pattern rewinds and rescans from the start.

        Raises IndexError if index is outside the file.
        """
        if index < 0:
            # Without this, a negative index skipped the read loop below
            # entirely and fell through to an unbound local.
            index = index + len(self)
            if index < 0:
                raise IndexError("index out of range")
        if index < self.nextindex:
            self.rewind()
        # Here index >= self.nextindex, so the loop runs at least once and
        # `line` is always bound on exit.
        while index >= self.nextindex:
            line = self.file.readline()
            if not line:
                raise IndexError("index out of range")
            self.nextindex = self.nextindex + 1
        return line