[pylucene-dev] PythonDirectory to avoid 2GB limit problem
Yura Smolsky
info at altervisionmedia.com
Wed Aug 23 08:46:56 PDT 2006
Hello, Julien.
i will look into the problem today. thanks.
JA> Hi there,
JA> I've been trying out your new Python directory implementation for
JA> NXLucene which is using PyLucene 1.9.1
JA> We are using gcc-3.4.6. Everything has been stable for weeks now, but I'm
JA> reaching the 2 GB limitation of gcc-3.4.6.
JA> I saw two problems with the PythonFileLock you implemented :
JA> - It appears that it is not thread safe. Try out with something like
JA> 100 threads.
JA> I got timeouts within obtainTimeout() from the PythonFileLock in some of
JA> my NXLucene multi-threading tests. There seems to be an issue as soon as
JA> you have two kinds of lock names (write / commit locks).
JA> - It appears to be really inefficient with lots of threads compared to
JA> the FSDirectory. I suspect lots of retries occurred within the
JA> obtainTimeout().
JA> I don't really have a lot of time to investigate this right now. Do you
JA> guys have any ideas?
JA> I'm gonna give a try to the gcc-4.1.0 + patch solution.
JA> J.
JA> Yura Smolsky wrote:
>>>> I am going to implement a fully functional Python directory, equivalent to
>>>> the Java Lucene one. I will provide it here later, so you can include it in
>>>> PyLucene if somebody needs it.
>> AV> Great !
>>
>> 1. Ok. Here is the deal. I have finished the class.
>> This class implements FSDirectory functionality,
>> but using Python only, to avoid the 2 GB limit with gcc 3.4.6. (see PythonDirectory.py)
>>
>> This is the test for it (see test_PythonDirectory2.py)
>>
>> Please feel free to put this class into PyLucene distribution.
>>
>> 2. Everything seems to be perfect on Linux (Debian), but sometimes I do
>> receive random exceptions when I run testcases on Windows - about 1
>> time per 5 runs.
>>
>> And there is an even bigger problem. The optimize method does not work at all
>> on the Windows platform for this index (see attached archive). Use optimizeIndex.py
>> to reproduce the problem on the Windows platform. I got this exception:
>>
>> Traceback (most recent call last):
>> File "D:\workshop\index\optimizeIndex.py", line 16, in ?
>> writer.optimize()
>> PyLucene.JavaError: java.lang.NullPointerException
>>
>> Again, everything is okay on Linux.
>>
>> --
>> Yura Smolsky,
>> http://altervisionmedia.com/
>>
>>
>> ------------------------------------------------------------------------
>>
>> import os, sys
>> import PyLucene
>> import md5
>> import time
>>
>> DEBUG = False
>>
class DebugWrapper(object):
    """Proxy that traces every attribute looked up on a wrapped object.

    Each lookup that falls through to ``__getattr__`` writes one line of the
    form ``<class name> <obj.name> <attribute>`` to standard output and then
    returns the attribute of the wrapped object. The wrapped object must
    expose a ``name`` attribute.
    """

    def __init__(self, obj):
        self.obj = obj

    def __getattr__(self, name):
        # ``__getattr__`` only runs when normal lookup fails. If ``obj``
        # itself is missing (instance built without ``__init__``, e.g. by
        # copy or pickle), reading ``self.obj`` below would re-enter this
        # method forever, so fail fast instead.
        if name == 'obj':
            raise AttributeError(name)
        target = self.obj
        # Same text the ``print a, b, c`` statement produced, but spelled
        # so that it is valid on both Python 2 and Python 3.
        sys.stdout.write("%s %s %s\n"
                         % (target.__class__.__name__, target.name, name))
        sys.stdout.flush()
        return getattr(target, name)
>>
class DebugFactory(object):
    """Callable stand-in for a class that returns traced instances.

    Calling the factory forwards all arguments to the stored class and
    hands back the new instance wrapped in a ``DebugWrapper``.
    """

    def __init__(self, klass):
        self.klass = klass

    def __call__(self, *args, **kw):
        # Build the real object first, then wrap it for tracing.
        return DebugWrapper(self.klass(*args, **kw))
>>
class PythonFileLock(object):
    """Lock represented by the existence of a file in ``lockDir``.

    The lock is held while the file ``lockDir/lockFile`` exists. The file
    is created with an exclusive-create call, so of several threads or
    processes racing for the same lock at most one succeeds.
    """

    # Delay between two attempts in obtainTimeout(), in milliseconds.
    LOCK_POLL_INTERVAL = 1000

    # Permission bits for a new lock file (octal 666, subject to the umask),
    # the same bits a plain open(path, 'w') would request.
    _LOCK_FILE_MODE = int('666', 8)

    def __init__(self, lockDir, lockFile):
        self.name = lockFile
        self.lockDir = lockDir
        self.lockFile = os.path.join(lockDir, lockFile)

    def isLocked(self):
        """Return True if the lock file currently exists."""
        return os.path.exists(self.lockFile)

    def obtainTimeout(self, timeout):
        """Try to obtain the lock, retrying for up to ``timeout`` ms.

        Attempts are spaced ``LOCK_POLL_INTERVAL`` milliseconds apart.
        Returns True once the lock is obtained; raises ``Exception`` with a
        "Lock obtain timed out" message when all attempts fail.
        """
        pollInterval = self.LOCK_POLL_INTERVAL
        maxSleepCount = int(timeout // pollInterval)
        sleepCount = 0
        while not self.obtain():
            if sleepCount >= maxSleepCount:
                raise Exception("Lock obtain timed out: " + self.toString())
            # Sleep one poll interval (converted to seconds), not the whole
            # timeout, so the total wait stays close to ``timeout``.
            time.sleep(pollInterval / 1000.0)
            sleepCount += 1
        return True

    def obtain(self):
        """Try once to take the lock; return True on success, else False."""
        if not os.path.exists(self.lockDir):
            try:
                os.makedirs(self.lockDir)
            except OSError:
                # Another thread/process may have created it meanwhile.
                if not os.path.isdir(self.lockDir):
                    raise

        try:
            # O_CREAT | O_EXCL makes "check that nobody holds the lock" and
            # "create the lock file" a single atomic step.
            fd = os.open(self.lockFile,
                         os.O_CREAT | os.O_EXCL | os.O_WRONLY,
                         self._LOCK_FILE_MODE)
        except OSError:
            return False
        os.close(fd)
        return True

    def release(self):
        """Remove the lock file; releasing an unheld lock is a no-op."""
        try:
            os.remove(self.lockFile)
        except OSError:
            # Only ignore the failure when the file is really gone.
            if os.path.exists(self.lockFile):
                raise
        return True

    def toString(self):
        return 'Lock@' + self.lockFile
>>
>>
class PythonFileStream(object):
    """Thin wrapper around an open file handle used as an index stream.

    ``name`` is the stream's file name, ``fh`` the underlying file object
    and ``size`` the length in bytes the stream starts out with.
    """

    def __init__(self, name, fh, size=0):
        self.name = name
        self.fh = fh
        self._length = size
        self.isOpen = True

    def close(self, isClone=False):
        """Close the underlying handle once; clones never close it."""
        if isClone or not self.isOpen:
            return
        self.isOpen = False
        self.fh.close()

    def seek(self, pos):
        """Move the underlying handle to absolute position ``pos``."""
        self.fh.seek(pos)

    def read(self, length, pos):
        """Return up to ``length`` bytes starting at position ``pos``."""
        self.fh.seek(pos)
        return self.fh.read(length)

    def write(self, buffer):
        """Write ``buffer`` at the current position and flush it."""
        self.fh.write(buffer)
        self.fh.flush()
        # After a seek() back into already written data, a write overwrites
        # bytes instead of appending them, so the length may only grow when
        # the write position moves past the previous end.
        end = self.fh.tell()
        if end > self._length:
            self._length = end

    def length(self):
        """Return the tracked length of the stream in bytes."""
        return self._length
>>
>>
class PythonFileDirectory(object):
    """Directory of index files stored under one file-system path.

    Files are accessed through ``PythonFileStream`` objects and locks are
    ``PythonFileLock`` files placed in ``LOCK_DIR``.
    """

    # Directory for lock files: the "org.apache.lucene.lockDir" system
    # property, falling back to "java.io.tmpdir". When neither yields a
    # value, __init__ uses the index directory itself.
    LOCK_DIR = PyLucene.System.getProperty(
        "org.apache.lucene.lockDir",
        PyLucene.System.getProperty("java.io.tmpdir"))

    def __init__(self, path, create=False):
        """Open the directory at ``path``; wipe/create it when ``create``."""
        self.path = os.path.realpath(path)
        self.name = self.path
        self._locks = {}
        self._streams = []
        if not self.LOCK_DIR:
            self.LOCK_DIR = self.path
        if create:
            self.create()

        assert os.path.isdir(path), "not a directory: %s" % (path,)

    def create(self):
        """Create the directory if needed and delete its old contents.

        Also removes this directory's stale lock files from ``LOCK_DIR``.
        """
        if not os.path.exists(self.path):
            os.makedirs(self.path)

        for oldFile in os.listdir(self.path):
            os.remove(os.path.join(self.path, oldFile))

        # The lock directory is only created lazily by the first lock, so
        # it may not exist yet; then there is nothing to clean up.
        if os.path.isdir(self.LOCK_DIR):
            lockPrefix = self.getLockPrefix()
            for tmpFile in os.listdir(self.LOCK_DIR):
                if tmpFile.startswith(lockPrefix):
                    os.remove(os.path.join(self.LOCK_DIR, tmpFile))

    def close(self):
        """Close every stream handed out so far and forget them."""
        for s in self._streams:
            s.close()
        # Drop the references so closed streams do not pile up.
        self._streams = []

    def createOutput(self, name):
        """Create (or truncate) file ``name`` and return a stream on it."""
        file_path = os.path.join(self.path, name)
        # Binary mode: text mode would translate newline bytes on Windows
        # and corrupt the index data.
        fh = open(file_path, "wb")
        stream = PythonFileStream(name, fh)
        self._streams.append(stream)
        return stream

    def deleteFile(self, name):
        """Delete file ``name`` if it exists."""
        if self.fileExists(name):
            os.unlink(os.path.join(self.path, name))

    def fileExists(self, name):
        """Return True if file ``name`` exists in this directory."""
        return os.path.exists(os.path.join(self.path, name))

    def fileLength(self, name):
        """Return the size of file ``name`` in bytes."""
        file_path = os.path.join(self.path, name)
        return os.path.getsize(file_path)

    def fileModified(self, name):
        """Return the modification time of ``name`` as whole seconds."""
        file_path = os.path.join(self.path, name)
        return int(os.path.getmtime(file_path))

    def list(self):
        """Return the names of all entries in this directory."""
        return os.listdir(self.path)

    def openInput(self, name):
        """Open existing file ``name`` and return a stream for reading."""
        file_path = os.path.join(self.path, name)
        # Binary mode for the same reason as in createOutput().
        fh = open(file_path, 'rb')
        stream = PythonFileStream(name, fh, os.path.getsize(file_path))
        self._streams.append(stream)
        return stream

    def renameFile(self, fname, tname):
        """Rename ``fname`` to ``tname``, replacing an existing target."""
        fromName = os.path.join(self.path, fname)
        toName = os.path.join(self.path, tname)
        # os.rename() refuses to overwrite on Windows, so clear the way.
        if os.path.exists(toName):
            os.remove(toName)
        os.rename(fromName, toName)

    def touchFile(self, name):
        """Set the modification time of file ``name`` to the current time."""
        file_path = os.path.join(self.path, name)
        # 'rw' is not a valid open() mode; os.utime() updates the timestamp
        # directly without rewriting any file content.
        os.utime(file_path, None)

    def makeLock(self, name):
        """Return the lock object for ``name``, creating it on first use."""
        lock = self._locks.get(name)
        if lock is None:
            lockFile = self.getLockPrefix() + '-' + name
            lock = PythonFileLock(self.LOCK_DIR, lockFile)
            self._locks[name] = lock
        return lock

    def getHexDigest(self, string):
        """Return the hexadecimal MD5 digest of ``string``."""
        return md5.new(string).hexdigest()

    def getLockPrefix(self):
        """Return the lock-file name prefix derived from this directory."""
        dirName = os.path.realpath(self.path)
        return 'lucene-' + self.getHexDigest(dirName)
>>
if DEBUG:
    # Rebind each public class name to a factory producing traced instances.
    for _klass in (PythonFileDirectory, PythonFileStream, PythonFileLock):
        globals()[_klass.__name__] = DebugFactory(_klass)
    del _klass
>>
>>
>> ------------------------------------------------------------------------
>>
>> #!/usr/local/bin/python
>>
>> import os, sys, unittest, shutil, weakref
>> import test_PyLucene
>> from PythonDirectory import *
>>
>> """
>> The Directory Implementation here is for testing purposes only, not meant
>> as an example of writing one, the implementation here suffers from a lack
>> of safety when dealing with concurrent modifications as it does away with
>> the file locking in the default lucene fsdirectory implementation.
>> """
>>
>>
>>
>>
class PythonDirectoryTests(unittest.TestCase,
                           test_PyLucene.Test_PyLuceneBase):
    """Runs the shared PyLucene test mixin against PythonFileDirectory.

    The index lives in ``STORE_DIR``, which is created before and removed
    after every test.
    """

    STORE_DIR = "testpyrepo"

    def setUp(self):
        # Nothing to do when the store directory is already there.
        if os.path.exists(self.STORE_DIR):
            return
        os.mkdir(self.STORE_DIR)

    def tearDown(self):
        if not os.path.exists(self.STORE_DIR):
            return
        shutil.rmtree(self.STORE_DIR)

    def openStore(self):
        return PythonFileDirectory(self.STORE_DIR)

    def closeStore(self, store, *args):
        # Close any extra truthy resources first, then the store itself.
        for resource in args:
            if resource:
                resource.close()
        store.close()

    def test_IncrementalLoop(self):
        # test_indexDocument is not defined here; presumably it comes from
        # Test_PyLuceneBase -- confirm against test_PyLucene.
        print("Testing Indexing Incremental Looping")
        for round_no in range(100):
            print(" ".join(["indexing ", str(round_no)]))
            sys.stdout.flush()
            self.test_indexDocument()
>>
>>
if __name__ == "__main__":
    import sys
    if '-loop' in sys.argv:
        # "-loop" reruns the whole suite until the process is interrupted.
        sys.argv.remove('-loop')
        while True:
            try:
                unittest.main()
            except (SystemExit, Exception):
                # unittest.main() ends with SystemExit after each run;
                # swallow it (and ordinary errors) to keep looping, but
                # let KeyboardInterrupt through so Ctrl-C stops the loop.
                pass
    else:
        unittest.main()
>>
>>
>>
>>
>> ------------------------------------------------------------------------
>>
>> _______________________________________________
>> pylucene-dev mailing list
>> pylucene-dev at osafoundation.org
>> http://lists.osafoundation.org/mailman/listinfo/pylucene-dev
--
Yura Smolsky,
http://altervisionmedia.com/
More information about the pylucene-dev
mailing list