# -*- coding: UTF-8 -*-
# lvjiyong on 2015/4/20.

"""

本文件中的方法仅为测试提供服务，gelid只专注于文本内容分析。

"""

import urllib2
from StringIO import StringIO
import gzip
from urlparse import urljoin, urlparse
import chardet
from gelid.extractors import regex

# Default timeout used for HTTP requests in this module; it is the default
# value of the ``timeout`` arguments that end up in ``urllib2.urlopen``.
request_timeout = 60

class Request(object):
    """
    Fetch a URL on construction and expose the outcome as ``self.response``.

    The request is executed inside ``__init__``; afterwards the optional
    ``callback`` is invoked with the resulting ``Response``.

    :param url: address to fetch; only values starting with ``http`` are requested
    :param callback: optional callable that receives the ``Response``
    :param method: HTTP method name, stored upper-cased
    :param headers: request headers; built with ``Http.headers`` when empty
    :param body: stored on the instance as-is
    :param no_redirect: forwarded to ``Http.get_unicode_with_header``
    :param cookies: forwarded to ``Http.headers`` when default headers are built
    :param meta: mapping copied onto the request and handed to the response
    :param encoding: explicit body encoding; detected when omitted
    :param err_callback: forwarded to ``Http.get_unicode_with_header`` as ``error_callback``
    :param data: request payload forwarded to ``Http.get_unicode_with_header``
    :param timeout: timeout forwarded to ``Http.get_unicode_with_header``
    """

    def __init__(self, url, callback=None, method='GET', headers=None, body=None, no_redirect=None,
                 cookies=None, meta=None, encoding=None, err_callback=None, data=None, timeout=request_timeout):
        self._encoding = encoding
        self.method = str(method).upper()
        self.url = url
        self.body = body
        self.callback = callback
        self.err_callback = err_callback
        self.cookies = cookies or {}
        self.headers = headers
        self.meta = dict(meta) if meta else None
        self._body = None
        self.response = None
        self.count = 0
        self.data = data
        self.timeout = timeout
        self.no_redirect = no_redirect

        print(url)

        self.set_response(url=url)
        if callback:
            call = callback(self.response)
            # When the callback returns an object exposing ``next`` (for
            # example a Python 2 generator), advance it once.
            if hasattr(call, 'next'):
                call.next()

    def set_response(self, url):
        """
        Fetch ``url`` and build ``self.response`` from the result.

        The body is decoded with, in order of preference: the encoding given
        to the constructor, the encoding found in the body, the encoding
        found in the ``Content-Type`` header, and finally ``utf-8``. If
        decoding with that encoding fails, chardet's guess is used instead.
        When there is no body (empty URL, non-http URL or failed fetch) the
        response is still created but nothing is decoded.

        :param url: address to fetch
        """
        print(url)
        self.url = url
        response_headers = dict()
        if url and str(url).startswith('http'):
            if not self.headers:
                self.headers = Http.headers(url=url, cookie=self.cookies)
            html, fetched_headers = Http.get_unicode_with_header(
                url, headers=self.headers, no_redirect=self.no_redirect, count=self.count,
                data=self.data, timeout=self.timeout, error_callback=self.err_callback)
            self._body = html
            # A failed fetch yields None instead of a header mapping; keep the
            # empty dict in that case so the lookups below stay valid.
            if fetched_headers is not None:
                response_headers = fetched_headers

        # NOTE(review): response headers normally carry 'Set-Cookie' rather
        # than 'Cookie'; confirm which key was intended here.
        self.response = Response(url=url,
                                 method=self.method,
                                 headers=response_headers,
                                 body=self._body,
                                 cookies=response_headers.get('Cookie'),
                                 meta=self.meta, encoding=self._encoding)

        # Nothing to decode without a URL or without a body.
        if not url or self._body is None:
            return

        # Work out the encoding, then decode the body.
        if not self._encoding:
            self._encoding = (regex.match_encoding(self._body)
                              or regex.match_encoding(response_headers.get("Content-Type", ""))
                              or 'utf-8')

        self.response.encoding = self._encoding
        # gb18030 is used for decoding in place of gbk/gb2312; the response
        # keeps reporting the originally detected name.
        if self._encoding.lower() in ('gbk', 'gb2312'):
            self._encoding = 'gb18030'
        try:
            self.response._body_as_unicode = self._body.decode(self._encoding, 'ignore')
        except Exception:
            # Fall back to chardet; its guess may be None, in which case
            # utf-8 is used so the decode cannot fail on a missing name.
            detected = chardet.detect(self._body).get('encoding') or 'utf-8'
            self.response.encoding = self._encoding = detected
            self.response._body_as_unicode = self._body.decode(detected, 'ignore')


class Http(object):
    """
    Static helpers that perform HTTP requests through ``urllib2``.
    """

    # Upper bound on the recursive re-fetches (manual redirects and cookie
    # retries) performed by ``get_unicode_with_header``.
    _MAX_REFETCHES = 2

    @staticmethod
    def headers(url='', cookie=''):
        """
        Build the default request headers.

        :param url: address being requested; its host and path form the Referer
        :param cookie: value for the ``Cookie`` header, omitted when empty
        :return: dict of header names to values
        """
        _url = urlparse(url)
        header_dict = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:34.0) Gecko/20100101 Firefox/34.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'Referer': "http://{0}{1}".format(_url.hostname, _url.path),
            'Accept-Encoding': 'gzip, deflate',
        }
        if cookie:
            header_dict['Cookie'] = cookie
        return header_dict

    @staticmethod
    def get_unicode_with_header(url, headers=None, no_redirect=None, count=0, data=None, timeout=request_timeout, error_callback=None):
        """
        Fetch ``url`` and return its body together with the response headers.

        Gzip-encoded bodies are decompressed. When the body is shorter than
        200 characters and a cookie is found (in the body via
        ``regex.search_cookie`` or in the ``Set-Cookie`` response header),
        the cookie is written into ``headers`` and the request is repeated.
        Note that this mutates the ``headers`` dict supplied by the caller.

        :param url: address to fetch
        :param headers: request headers; built with ``Http.headers`` when empty
        :param no_redirect: when true, redirect statuses are not followed here
        :param count: number of re-fetches already performed
        :param data: request payload handed to ``urllib2.Request``
        :param timeout: timeout handed to ``urllib2.urlopen``
        :param error_callback: optional callable that receives any exception raised
        :return: ``(body, response_headers)``; both are None when the request fails
        """
        html = None
        page_headers = None
        print(count)
        if not headers:
            headers = Http.headers(url)
        can_refetch = count < Http._MAX_REFETCHES
        try:
            print(headers)
            req = urllib2.Request(url=url, headers=headers, data=data)
            page = urllib2.urlopen(req, timeout=timeout)
            try:
                # NOTE: Http.headers advertises 'deflate' as well, but only
                # gzip bodies are decoded here.
                if page.info().get('Content-Encoding') == 'gzip':
                    html = gzip.GzipFile(fileobj=StringIO(page.read())).read()
                else:
                    html = page.read()
                status = page.getcode()
                page_headers = page.headers
            finally:
                # Release the connection on every path, including errors.
                page.close()

            # Follow a redirect manually if one is reported.
            if not no_redirect and can_refetch:
                if status in (301, 302, 303, 307) and 'Location' in page_headers:
                    redirected_url = urljoin(url, page_headers['location'])
                    return Http.get_unicode_with_header(redirected_url, headers=headers, no_redirect=no_redirect,
                                                        count=count + 1, data=data, timeout=timeout,
                                                        error_callback=error_callback)

            if len(html) < 200:
                # Prefer a cookie found in the body; otherwise use the one
                # the server sent in its response headers.
                page_cookie = regex.search_cookie(html) or page_headers.get('Set-Cookie') or None
                print(page_cookie)
                # Retry only while under the re-fetch limit and only when the
                # cookie differs from the one already sent; repeating an
                # identical request would just loop.
                if page_cookie and can_refetch and headers.get('Cookie') != page_cookie:
                    headers['Cookie'] = page_cookie
                    print(headers)
                    return Http.get_unicode_with_header(url, headers=headers, no_redirect=no_redirect,
                                                        count=count + 1, data=data, timeout=timeout,
                                                        error_callback=error_callback)
        except Exception as e:
            # Best effort: report the failure if asked to, then return
            # whatever was collected so far.
            if error_callback:
                error_callback(e)
        print(page_headers)
        return html, page_headers

    @staticmethod
    def get_unicode(url, headers=None, no_redirect=None, count=0, data=None, timeout=request_timeout, error_callback=None):
        """
        Fetch ``url`` and return only its body.

        :param url: address to fetch
        :param headers: request headers; built with ``Http.headers`` when empty
        :param no_redirect: when true, redirect statuses are not followed
        :param count: number of re-fetches already performed
        :param data: request payload
        :param timeout: timeout handed to ``urllib2.urlopen``
        :param error_callback: optional callable that receives any exception raised
        :return: the body, or None when the request fails
        """
        html, _ = Http.get_unicode_with_header(url=url, headers=headers, no_redirect=no_redirect, count=count,
                                               data=data, timeout=timeout, error_callback=error_callback)
        return html

    @staticmethod
    def get_file(url, headers=None, error_callback=None, timeout=request_timeout):
        """
        Download ``url`` when its ``Content-Type`` mentions ``image`` or ``stream``.

        :param url: address to fetch; only values starting with ``http`` are requested
        :param headers: request headers; built with ``Http.headers`` when empty
        :param error_callback: optional callable that receives any exception raised
        :param timeout: timeout handed to ``urllib2.urlopen``
        :return: the downloaded content, or None when nothing was downloaded
        """
        download = None
        try:
            if not headers:
                headers = Http.headers(url)

            if url and str(url).startswith('http'):
                req = urllib2.Request(url=url, headers=headers)
                page = urllib2.urlopen(req, timeout=timeout)
                try:
                    content_type = page.headers.get('Content-Type')
                    if content_type and ('image' in content_type or 'stream' in content_type):
                        download = page.read()
                finally:
                    # Close the response even if reading it fails.
                    page.close()
        except Exception as e:
            if error_callback:
                error_callback(e)
        return download


class Response(object):
    """
    Container for the outcome of an HTTP request.

    :param url: address the response belongs to
    :param method: HTTP method of the originating request
    :param headers: response headers; an empty dict is stored when none are given
    :param body: response body, stored as passed in
    :param cookies: cookies; an empty dict is stored when none are given
    :param meta: caller-supplied metadata; stored as None when empty or omitted
    :param encoding: name of the encoding associated with the body
    """

    def __init__(self, url, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8'):
        self.url = url
        # The method used to be written to ``self.meta`` and then overwritten
        # by the ``meta`` argument, so it was never retrievable.
        self.method = method
        self.headers = headers or dict()
        self.body = body
        self.cookies = cookies or {}
        self.meta = meta if meta else None
        self.encoding = encoding
        self._body_as_unicode = None
        self._cached_selector = None

    def body_as_unicode(self):
        """Return the decoded body, or None when it has not been set."""
        return self._body_as_unicode