# textopen, version 1.2.1 # # Released to the public domain 16 August 1999 by Oliver Steele, # steele@cs.brandeis.edu. """Module textopen -- read from text files that use foreign line separators USAGE from textopen import textopen file1 = textopen(pathname) is equivalent to file2 = open(pathname) except that file1.readline() will return the first line that's terminated by any of '\n' (UNIX), '\r' (MacOS), or '\r\n' (PC), whereas file2.readline() will look for a different line separator depending on the execution platform (and won't generally work for files that use a different convention). The line that's returned, in any case, will end in '\n' (or nothing, if it's an unterminated line that ends the file). RATIONALE Using open followed by readline can attempt to read the whole file if it's a non-native text file (for instance, a UNIX file on a Mac). It's therefore not generally possible to read the whole file and then look for line breaks (the file may not fit in memory), and it's inconvenient to read it in blocks. This module reads in blocks, but it isolates the inconvenience from the client programs. DETAILS textopen returns an object that supports a subset of the file protocol (readline, readlines, seek, and tell). textopen can also be applied to a file object (or any object that implements the read, seek and tell functions) -- that is, textopen(pathname) is equivalent to textopen(open(pathname)). """ __author__ = "Oliver Steele", 'steele@cs.brandeis.edu' __version__ = '1.2.1' # Change history: # 1.0 2/22/99 # Initial version. # 1.1 3/22/99 # Return object now fully emulates a file: # - f.closed, f.mode, f.name, and f.softspace are defined (and call the basis object) # - f.close() is defined (ditto) # 1.2 6/11/99 # Added an optional 'mode' argument, for compatibility with open(). # Added test code. # 1.2.1 8/16/99 # Released to the public domain. # Fixed a bug where a final line that didn't end in a line separator could be # ignored.
import string  # not used below; kept so the module namespace is unchanged


def textopen(pathOrFile, mode='r'):
    """Open pathOrFile so that readline() accepts any line separator.

    If mode is 'r' (the default), return a file-like object based on
    pathOrFile (a pathname or a file-like object) whose readline() method
    understands UNIX (LF), Mac (CR), and PC (CR LF) separators, and returns
    lines that end in a single LF in all cases.

    For any other mode except 'r+', this is the same as
    open(pathOrFile, mode).

    Raises NotImplementedError if mode is 'r+'.
    """
    if mode == 'r':
        return _TextFileLinefeedAdaptor(pathOrFile)
    if mode == 'r+':
        raise NotImplementedError("'r+' mode is not implemented")
    # Pass the mode through; without it every other mode would silently
    # open the file for text reading.
    return open(pathOrFile, mode)


class _TextFileLinefeedAdaptor:
    """File-like wrapper whose readline() normalizes CR, LF, CR LF and LF CR.

    The wrapped object only needs read(), seek() and tell() (plus close()
    and the attributes forwarded by __getattr__, if those are used).  Data
    is pulled from it BLOCK_SIZE units at a time and the unconsumed
    remainder of the last block is kept in self.buffer.

    Lines have the same type as whatever the wrapped read() returns.  A
    pathname is opened in binary mode ('rb'), so on Python 3 the lines are
    bytes; pass a text file object opened with newline='' to get str.
    """

    BLOCK_SIZE = 512

    def __init__(self, pathOrFile):
        """Wrap pathOrFile, opening it in 'rb' mode if it is not file-like."""
        source = pathOrFile
        # Anything without a read() method is treated as a pathname.
        if not hasattr(source, 'read'):
            source = open(source, 'rb')
        self.file = source
        # Data already read from self.file but not yet returned to the
        # caller; None or empty means "nothing pending".
        self.buffer = None

    def __getattr__(self, name):
        """Forward the standard file attributes to the wrapped object."""
        if name in ('closed', 'mode', 'name', 'softspace'):
            return getattr(self.file, name)
        raise AttributeError(name)

    def _separators(self, sample):
        """Return (CR, LF) constants of the same string type as sample."""
        if isinstance(sample, bytes):
            return b'\r', b'\n'
        return '\r', '\n'

    def close(self):
        """Close the wrapped file object."""
        self.file.close()

    def readline(self):
        """Return the next line, terminated by LF, or an empty string at EOF.

        A final line that has no separator is returned without one.
        """
        # Blocks that contained no separator at all; they form the start
        # of the line being assembled.  Collected in a loop (rather than by
        # recursing once per block) so that a very long line cannot exceed
        # the interpreter's recursion limit.
        pieces = []
        buffer = self.buffer
        while True:
            if not buffer:
                buffer = self.file.read(self.BLOCK_SIZE)
                if not buffer:
                    # End of file: return whatever was collected.
                    if pieces:
                        return pieces[0][:0].join(pieces)
                    if isinstance(buffer, bytes):
                        return buffer
                    return ''
            cr, lf = self._separators(buffer)
            # Find the earliest CR or LF.
            crpos = buffer.find(cr)
            lfpos = buffer.find(lf)
            if crpos >= 0 or lfpos >= 0:
                break
            # No separator in this block: keep it and fetch the next one.
            pieces.append(buffer)
            self.buffer = buffer = None

        if lfpos < 0 or 0 <= crpos < lfpos:
            # '\r\n?'
            line = buffer[:crpos] + lf
            otherchar = lf
            nextpos = crpos + 1
        else:
            # '\n\r?'
            line = buffer[:lfpos + 1]  # include the '\n'
            otherchar = cr
            nextpos = lfpos + 1

        if nextpos == len(buffer):
            # The separator was the last character of the block; the other
            # half of a two-character separator may start the next block.
            buffer = self.file.read(self.BLOCK_SIZE)
            nextpos = 0

        # Skip over a '\r' after a '\n' or vice versa.  This interprets
        # '\n\r' as a newline as well as PC '\r\n', but on the Mac line
        # separators in a PC file opened in text mode (default or "r", as
        # opposed to "rb") read '\n\r', so the added generality makes the
        # class work as a wrapper for files that are opened in text mode
        # outside our control.
        # A one-element slice is compared (not buffer[nextpos]) because
        # indexing bytes yields an int on Python 3.
        if buffer and buffer[nextpos:nextpos + 1] == otherchar:
            nextpos = nextpos + 1
        if buffer:
            self.buffer = buffer[nextpos:]
        else:
            self.buffer = None

        if pieces:
            pieces.append(line)
            line = line[:0].join(pieces)
        return line

    def readlines(self):
        """Return a list of all remaining lines, as produced by readline()."""
        lines = []
        while True:
            line = self.readline()
            if not line:
                break
            lines.append(line)
        return lines

    def read(self, length=None):
        """Read raw data from the current logical position.

        With length omitted or None, read to end of file; otherwise read
        at most length units.  No line-separator translation is done.
        """
        # Reposition the wrapped file at the logical position and drop the
        # read-ahead buffer, so the raw read starts where readline() stopped.
        self.seek(self.tell())
        if length is None:
            return self.file.read()
        return self.file.read(length)

    def rewind(self):
        """Seek back to the start of the file."""
        self.seek(0)

    def seek(self, position):
        """Move to absolute position and discard any read-ahead data."""
        self.file.seek(position)
        self.buffer = None

    def tell(self):
        """Return the logical position: the wrapped file's position minus
        the amount of read-ahead data not yet handed to the caller."""
        return self.file.tell() - len(self.buffer or "")


def _test():
    """Run the self-test for line lengths around the BLOCK_SIZE boundary."""
    _testlength(1)
    for index in range(-2, 3):
        _testlength(_TextFileLinefeedAdaptor.BLOCK_SIZE + index)


def _testlength(length):
    """Write three lines of the given length with each separator convention
    to a temporary file, read them back through textopen(), and print a
    message for every line that does not come back LF-terminated."""
    import os
    import tempfile

    lines = [digit * length for digit in (b'1', b'2', b'3')]
    # 'DOS' is CR LF as described in the module documentation; the reversed
    # pair, which readline() also accepts, is checked separately.
    separators = (
        ('UNIX', b'\n'),
        ('MacOS', b'\r'),
        ('DOS', b'\r\n'),
        ('DOS (reversed)', b'\n\r'),
    )
    for platform, separator in separators:
        # mkstemp creates the file atomically, unlike mktemp.
        handle, filename = tempfile.mkstemp()
        try:
            with os.fdopen(handle, 'wb') as output:
                # leave the separator off the last line, to test that
                # case too
                output.write(separator.join(lines))
            reader = textopen(filename, 'r')
            try:
                for index in range(len(lines)):
                    expected = lines[index]
                    if index != len(lines) - 1:
                        expected = expected + b'\n'
                    actual = reader.readline()
                    if expected != actual:
                        print('Platform %s:\n expected: %s\n read: %s'
                              % (platform, repr(expected), repr(actual)))
            finally:
                reader.close()
        finally:
            os.remove(filename)