#!/usr/bin/env python

import gzip
import StringIO
import Queue
import threading
import urlparse
import sys
import argparse
import logging
import os

import boto
import boto.s3.connection

# Route log records to stderr at INFO level; the script entry point writes
# downloaded file contents to stdout, so the two streams stay separate.
logging.basicConfig(level=logging.INFO, stream=sys.stderr)

# Module-wide logger, named after this script's file name.
logger = logging.getLogger(os.path.basename(__file__))


class FileProducer(threading.Thread):
    """Worker thread that reads keys and publishes their contents.

    Each worker repeatedly takes a key object off the shared ``keys`` queue,
    reads its contents (gunzipping when the key name ends in ``.gz``) and
    puts the resulting string on the ``messages`` queue.  Once ``keys`` is
    drained, or the worker dies unexpectedly, exactly one ``None`` sentinel
    is put on ``messages`` so the consumer can count finished workers.
    """

    def __init__(self, keys, messages, num_attempts):
        """Store the shared queues and the per-key retry budget.

        :param keys: queue of key objects; each must expose ``name`` and
            ``get_contents_as_string()``.
        :param messages: queue receiving file contents and the final
            ``None`` sentinel.
        :param num_attempts: maximum number of read attempts per key.
        """
        super(FileProducer, self).__init__()
        self.keys = keys
        self.messages = messages
        self.num_attempts = num_attempts

    def run(self):
        """Process keys until the queue is empty, then signal completion."""
        try:
            while True:
                # get_nowait() instead of empty()+get(): with several workers
                # the queue can be drained between the two calls, which would
                # leave this worker blocked in get() forever.
                try:
                    key = self.keys.get_nowait()
                except Queue.Empty:
                    return

                try:
                    self._process(key)
                except Exception:
                    logger.exception('Error getting file %s' % key.name)
                finally:
                    # Every get() is matched by task_done(), including on
                    # failed downloads, so keys.join() cannot hang.
                    self.keys.task_done()
        finally:
            # Always emit the sentinel; the consumer waits for one per worker.
            self.messages.put(None)

    def _process(self, key):
        """Read one key and put its (decompressed) contents on ``messages``."""
        logger.info('Reading file %s...' % key.name)

        data = self._fetch(key)
        if data is None:
            logger.info('Failed %s times.' % self.num_attempts)
            return

        if key.name.endswith('.gz'):
            data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
        self.messages.put(data)

    def _fetch(self, key):
        """Return the key's contents, or ``None`` if no attempt succeeded.

        An empty result is retried like a failure; if every attempt returns
        empty data, the empty string from the last attempt is returned.
        """
        data = None
        for attempt in xrange(self.num_attempts):
            try:
                data = key.get_contents_as_string()
            except Exception as e:
                # Previously swallowed silently; keep retrying but leave a
                # trace of why the attempt failed.
                logger.warning('Attempt %d of %d to read %s failed: %s',
                               attempt + 1, self.num_attempts, key.name, e)
                continue
            if data:
                break
        return data


def get_files(url_raw, total_threads=20, num_attempts=10):
    """Yield the contents of every S3 object listed for ``url_raw``.

    The URL's network location is used as the bucket name and its path
    (with surrounding slashes stripped) is passed to ``bucket.list()``.
    The listed keys are downloaded concurrently by ``FileProducer`` threads.

    :param url_raw: URL whose netloc names the bucket and whose path selects
        the keys to read.
    :param total_threads: number of concurrent download threads (>= 1).
    :param num_attempts: maximum read attempts per key.
    :returns: generator yielding one string per successfully read object,
        in completion order rather than listing order.
    :raises ValueError: if ``total_threads`` is less than 1 (raised when the
        generator is first advanced).
    """
    # With no workers nobody would ever post a sentinel and the loop below
    # would block forever.
    if total_threads < 1:
        raise ValueError('total_threads must be at least 1')

    keys = Queue.Queue()
    # Bounded so workers cannot run arbitrarily far ahead of the consumer.
    messages = Queue.Queue(maxsize=total_threads)

    conn = boto.connect_s3()

    url = urlparse.urlparse(url_raw)

    logger.info('Opening %s...' % url_raw)

    bucket = conn.get_bucket(url.netloc)
    for k in bucket.list(url.path.strip('/')):
        keys.put(k)

    producers = [FileProducer(keys, messages, num_attempts)
                 for _ in xrange(total_threads)]

    for producer in producers:
        # Daemon threads: if the consumer stops iterating early or the main
        # thread dies, workers blocked on the full ``messages`` queue must
        # not keep the process alive.
        producer.daemon = True
        producer.start()

    # Each worker posts exactly one None when it finishes.
    threads_finished = 0
    while threads_finished < total_threads:
        contents = messages.get()

        if contents is None:
            threads_finished += 1
        else:
            yield contents


if __name__ == '__main__':
    # Command-line entry point: stream the contents of every matching file
    # to stdout, then exit with status 0.
    cli = argparse.ArgumentParser()

    cli.add_argument('-r', '--retries', default=10, type=int,
                     help='Number of attempts')
    cli.add_argument('-w', '--workers', default=20, type=int,
                     help='Number of concurrent downloads')
    # NOTE(review): the help text mentions local paths, but get_files() always
    # goes through an S3 connection -- confirm whether local files are meant
    # to be supported.
    cli.add_argument('url', type=str, help='S3 URL or local file path')

    options = cli.parse_args()

    contents_iter = get_files(
        options.url,
        total_threads=options.workers,
        num_attempts=options.retries,
    )
    # Write each file as soon as it arrives rather than collecting them.
    for file_contents in contents_iter:
        sys.stdout.write(file_contents)

    sys.exit(0)

