"""
Scans the ARC cache directory, but in another process
in order to avoid blocking the twisted reactor.

This is done in a not-so-nice way: we write a Python program
to a temporary file and execute that program.
"""
import os
import tempfile

from twisted.python import log
from twisted.internet import reactor, defer, protocol
from twisted.protocols import basic

ARC_CONF = '/etc/arc.conf'
DATA_CACHE_SUBDIR = 'data'


SCAN_PROGRAM = '''#generated by arc cacheindex
import os

for dirpath, dirnames, filenames in os.walk("%s"):
    for filename in filenames:
        if filename.endswith('.meta'):
            url = dirpath.rsplit('/')[-1] + filename.split('.')[0]
            print url + "\\r\\n",
'''


class URLReceiver(basic.LineReceiver):

    def __init__(self, filter):
        self.filter = filter

    def lineReceived(self, line):
        self.filter(line.strip())



class ScanProtocol(protocol.ProcessProtocol):

    def __init__(self, filter, d):
        self.url_receiver = URLReceiver(filter)
        self.d = d

    def outReceived(self, data):
        self.url_receiver.dataReceived(data)

    def errReceived(self, data):
        log.msg("Error data received from scanning program. Oh noes")

    def processEnded(self, reason):
        if reason.value.exitCode == 0:
            self.d.callback(None)
            return # everything is just peachy

        log.err(reason)
        self.d.callback(reason)



def getARCCacheDirs():

    cache_dirs = []
    config = ARC_CONF
    if 'ARC_CONFIG' in os.environ:
        config = os.environ['ARC_CONFIG']
    for line in file(config):
        if line.startswith('cachedir') or line.startswith('remotecachedir'):
            args = line.split('=', 2)[1]
            cache_dir = args.split(' ')[0].replace('"', '').strip()
            cache_dirs.append(cache_dir)
    return cache_dirs



class CacheScanner:

    def __init__(self, cache_dir=None):

        if cache_dir is None:
            cache_dir = getARCCacheDirs()

        # compat with older configs, where cache_dir is a string
        # but we need to support multiple cache dirs
        if type(cache_dir) is str:
            cache_dir = [cache_dir]

        self.cache_dir = cache_dir


    def dir(self):
        return self.cache_dir


    def scan(self, filter):

        defs = []
        for cd in self.cache_dir:

            program = SCAN_PROGRAM % cd

            tf = tempfile.NamedTemporaryFile()
            tf.write(program)
            # ensure file content is in kernel before spawning process
            tf.flush()

            d = defer.Deferred()
            pp = ScanProtocol(filter, d)
            pt = reactor.spawnProcess(pp, 'python', args=['python', tf.name])

            def err(failure):
                log.err(failure)
                return failure

            def passthru(result, _):
                return result

            d.addErrback(err)
            # The semantics of the temporary file is that it will automatically
            # get deleted once it gets garbage collected. This means that if we
            # don't use the tf variable or set the delete flag to False, the
            # file will get deleted before we start using it. Unfortuantely
            # Python 2.5 and earlier does not support the delete flag, so
            # instead we keep the variable for the temporary file in use,
            # dealying its deletion until the filter has been generated, hence
            # the bogus passthru.
            d.addBoth(passthru, tf)
            defs.append(d)

        return defer.DeferredList(defs)



@defer.inlineCallbacks
def main():
    import sys, time

    #cache_dirs = sys.argv[2:]
    #print "Cache dirs", cache_dirs

    class Count:
        def __init__(self):
            self.count = 0
        def gotHash(self, hash):
            print hash
            self.count += 1

    c = Count()


    t0 = time.time()
    #yield CacheScanner(cache_dirs).scan(c.gotHash)
    yield CacheScanner().scan(c.gotHash)
    td = time.time() - t0

    print "Scan time:", td
    print "Objects scanned: ", c.count

    reactor.stop()


if __name__ == '__main__':
    reactor.callWhenRunning(main)
    reactor.run()

