[pylucene-dev] PythonDirectory to avoid 2GB limit problem

Yura Smolsky info at altervisionmedia.com
Thu Aug 10 10:25:28 PDT 2006


Hello, Andi.

>> I am going to implement fully functional python directory same to Java
>> Lucene one. I will provide it here later, so you can include into PyLucene
>> if somebody will need.
AV> Great !

1. Ok. Here is the deal. I have finished the class.
This class implements FSDIrectory functionality,
but using python only to avoid 2 gb limit with gcc 3.4.6. (see PythonDirectory.py)

This is the test for it (see test_PythonDirectory2.py)

Please feel free to put this class into PyLucene distribution.

2. Everything seems to be perfect on Linux (Debian), but sometimes I do
receive random exceptions when I run testcases on Windows - about 1
time per 5 runs.

And even bigger problem. Optimize method completely does not work on
windows platform for this index (see attached archive). Use optimizeIndex.py
to reproduce problem on windows plaftorm. I got this exception:

Traceback (most recent call last):
  File "D:\workshop\index\optimizeIndex.py", line 16, in ?
    writer.optimize()
PyLucene.JavaError: java.lang.NullPointerException

Again, everything is okay on Linux.

--
Yura Smolsky,
http://altervisionmedia.com/
-------------- next part --------------
import os, sys
import PyLucene
import md5
import time

DEBUG = False

class DebugWrapper( object ):

    def __init__(self, obj ):
        self.obj = obj

    def __getattr__(self, name):
        print self.obj.__class__.__name__, self.obj.name, name
        sys.stdout.flush()
        return getattr(self.obj, name )
        
class DebugFactory( object ):
    
    def __init__(self, klass):
        self.klass = klass
        
    def __call__(self, *args, **kw):
        instance = self.klass(*args, **kw)
        return DebugWrapper( instance )

class PythonFileLock( object ):
    # safe for a multimple processes
    
    LOCK_POLL_INTERVAL = 1000
    
    def __init__(self, lockDir, lockFile):
        self.name = lockFile
        self.lockDir = lockDir
        self.lockFile = os.path.join(lockDir, lockFile)
        #print self.lockFile

    def isLocked(self):
        return os.path.exists(self.lockFile)

    def obtainTimeout( self, timeout ):
        locked = self.obtain()
        maxSleepCount = round(timeout / self.LOCK_POLL_INTERVAL)
        sleepCount = 0
        while (not locked):
            if sleepCount >= maxSleepCount:
                raise Exception("Lock obtain timed out: " + self.toString())
            time.sleep(timeout/1000)
            locked = self.obtain()
            sleepCount += 1
        return locked

    def obtain( self ):
        if not os.path.exists(self.lockDir):
            os.makedirs(self.lockDir)
        
        if self.isLocked():
            return False

        try:
            open(self.lockFile, 'w')
        except:
            return False
        else:
            return True

    def release( self ):
        os.remove(self.lockFile)
        return True
    
    def toString(self):
        return 'Lock@' + self.lockFile


class PythonFileStream(object):

    def __init__(self, name, fh, size=0L):
        self.name = name
        self.fh = fh
        self._length = size
        self.isOpen = True

    def close(self, isClone=False):
        if isClone or not self.isOpen:
            return
        self.isOpen = False
        self.fh.close()

    def seek(self, pos):
        self.fh.seek(pos)

    def read(self, length, pos):
        self.fh.seek(pos)
        return self.fh.read(length)

    def write(self, buffer):
        self.fh.write(buffer)
        self.fh.flush()
        self._length += len(buffer)

    def length(self):
        return self._length

        
class PythonFileDirectory( object ):

    LOCK_DIR = PyLucene.System.getProperty("org.apache.lucene.lockDir",
      PyLucene.System.getProperty("java.io.tmpdir"));
    
    def __init__(self, path, create=False ):
        self.path = os.path.realpath(path)
        self.name = self.path
        self._locks = {}
        self._streams = []
        if not self.LOCK_DIR:
            self.LOCK_DIR = self.path
        if create:
            self.create()

        assert os.path.isdir( path )

    def create(self):
        if not os.path.exists(self.path):
            os.makedirs(self.path)

        oldFiles = os.listdir(self.path)
        for oldFile in oldFiles:
            os.remove(os.path.join(self.path, oldFile))

        lockPrefix = self.getLockPrefix()
        tmpFiles = os.listdir(self.LOCK_DIR)
        for tmpFile in tmpFiles:
            if tmpFile.startswith(lockPrefix):
                os.remove(os.path.join(self.LOCK_DIR, tmpFile))
        
        
    def close(self):
        for s in self._streams:
            s.close()

    def createOutput(self, name ):
        file_path = os.path.join( self.path, name )
        fh = open( file_path, "w" )
        stream = PythonFileStream( name, fh )
        self._streams.append(stream)
        return stream

    def deleteFile( self, name ):
        if self.fileExists(name):
            os.unlink( os.path.join( self.path, name ) )

    def fileExists( self, name ):
        return os.path.exists( os.path.join( self.path, name ) )

    def fileLength( self, name ):
        file_path = os.path.join( self.path, name )
        return os.path.getsize( file_path )

    def fileModified( self, name ):
        file_path = os.path.join( self.path, name )
        return int( os.path.getmtime( file_path ))

    def list(self):
        return os.listdir( self.path )

    def openInput( self, name ):
        file_path = os.path.join( self.path, name )
        fh = open( file_path, 'r')
        stream = PythonFileStream( name, fh, os.path.getsize(file_path) )
        self._streams.append(stream)
        return stream

    def renameFile(self, fname, tname):
        fromName = os.path.join( self.path, fname )
        toName = os.path.join( self.path, tname )
        if os.path.exists( toName ):
            os.remove( toName )
        os.rename( fromName, toName )

    def touchFile( self, name):

        file_path = os.path.join( self.path, name )        
        fh = open( file_path, 'rw')
        c = fh.read(1)
        fh.seek(0)
        fh.write(c)
        fh.close()

    def makeLock( self, name ):
        lockDir = self.LOCK_DIR
        lockFile = self.getLockPrefix() + '-' + name
        lock = self._locks.setdefault( name, PythonFileLock(lockDir, lockFile) )
        #print lock.toString()
        return lock

    def getHexDigest(self, string):
        m = md5.new(string)
        return m.hexdigest()
    
    def getLockPrefix(self):
        dirName = os.path.realpath(self.path)
        prefix = 'lucene-' + self.getHexDigest(dirName)
        return prefix

if DEBUG:
    _globals = globals()
    _globals['PythonFileDirectory'] = DebugFactory( PythonFileDirectory )
    _globals['PythonFileStream'] = DebugFactory( PythonFileStream )
    _globals['PythonFileLock'] = DebugFactory( PythonFileLock )
    del _globals
-------------- next part --------------
#!/usr/local/bin/python

import os, sys, unittest, shutil, weakref
import test_PyLucene 
from PythonDirectory import *

"""
The Directory Implementation here is for testing purposes only, not meant
as an example of writing one, the implementation here suffers from a lack
of safety when dealing with concurrent modifications as it does away with 
the file locking in the default lucene fsdirectory implementation.
"""




class PythonDirectoryTests( unittest.TestCase,
                            test_PyLucene.Test_PyLuceneBase ):

    STORE_DIR = "testpyrepo"

    def setUp( self ):
        if not os.path.exists( self.STORE_DIR ):
            os.mkdir( self.STORE_DIR )

    def tearDown( self ):
        if os.path.exists(self.STORE_DIR):
            shutil.rmtree(self.STORE_DIR)

    def openStore( self ):
        return PythonFileDirectory( self.STORE_DIR )

    def closeStore(self, store, *args):
        for arg in args:
            if arg: arg.close()
        store.close()

    def test_IncrementalLoop( self ):
        print "Testing Indexing Incremental Looping"
        for i in range(100):
            print "indexing ", i
            sys.stdout.flush()
            self.test_indexDocument()
                       

if __name__ == "__main__":
    import sys
    if '-loop' in sys.argv:
        sys.argv.remove('-loop')
        while True:
            try:
                unittest.main()
            except:
                pass
    else:
        unittest.main()


-------------- next part --------------
A non-text attachment was scrubbed...
Name: optimize_problem.zip
Type: application/x-zip-compressed
Size: 23783 bytes
Desc: not available
Url : http://lists.osafoundation.org/pipermail/pylucene-dev/attachments/20060810/e8015eeb/optimize_problem-0001.bin


More information about the pylucene-dev mailing list