# -*- coding: utf-8 -*-
import requests
from gongju import format_headers
from bs4 import BeautifulSoup
import datetime
from pymongo import MongoClient
import re
from selenium import webdriver
import time


class MaYiDuanZuClass:
    """Crawl mayi.com Guangzhou search-result pages into MongoDB.

    Instantiating the class runs the whole crawl: for every price band
    (1-20, 21-40, ... up to a ceiling of 1000) it walks the result pages
    for a one-night stay (today -> tomorrow) until a page has no
    ``<dl id="searchRoom">`` element, upserting one document per listing
    into ``Homestay.mayiduanzu_2018_10_10``.
    """

    # Placeholders: price floor, price ceiling, page number, check-in, check-out.
    _LIST_URL = 'http://www.mayi.com/guangzhou/p{}-{}/{}/?d1={}&d2={}'
    _PRICE_STEP = 20    # width of one price band
    _MAX_PRICE = 1000   # ceiling of the last band that is crawled
    _HTTP_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the crawl

    def __init__(self):
        client = MongoClient(host='127.0.0.1', port=27017)
        mydb = client['Homestay']
        self.mycol = mydb['mayiduanzu_2018_10_10']
        self.today = str(datetime.date.today())
        self.tomorrow = str(self.getTomorrow())

        floor_price, ceiling_price = 1, self._PRICE_STEP
        while True:
            page = 1
            while True:
                url = self._list_url(floor_price, ceiling_price, page)
                print(url)
                # None means the page had no result list: this band is exhausted.
                if self.mayiduanzu_list(url) is None:
                    break
                page += 1
            # ">=" rather than "==" so a changed step can never loop forever.
            if ceiling_price >= self._MAX_PRICE:
                break
            floor_price += self._PRICE_STEP
            ceiling_price += self._PRICE_STEP

    def _list_url(self, floor_price, ceiling_price, page):
        """Return the search URL for one price band and page number."""
        return self._LIST_URL.format(floor_price, ceiling_price, page, self.today, self.tomorrow)

    @staticmethod
    def getTomorrow():
        """Return tomorrow's date as a ``datetime.date``."""
        return datetime.date.today() + datetime.timedelta(days=1)

    @staticmethod
    def _parse_room_count(item_html):
        """Return the room count found in a listing's HTML, or None if absent.

        The plain "N居·" form is preferred; the "N居+·" form is the fallback.
        """
        for pattern in (r'>(\d+)居·', r'>(\d+)居\+·'):
            found = re.search(pattern, item_html)
            if found:
                return int(found.group(1))
        return None

    @staticmethod
    def _parse_score(item_html):
        """Return the listing score as a float, or None if missing or non-numeric."""
        found = re.search('">(.+)分', item_html)
        if found is None:
            return None
        try:
            return float(found.group(1))
        except ValueError:
            # The greedy capture can pick up non-numeric text; treat as "no score".
            return None

    def mayiduanzu_list(self, url):
        """Fetch one search-result page and upsert every listing on it.

        Args:
            url: Search-result page URL.

        Returns:
            None when the page has no ``<dl id="searchRoom">`` element,
            otherwise 1.

        Raises:
            RuntimeError: If the response status is not 200.
        """
        # NOTE(review): this literal embeds a captured browser session cookie;
        # it should come from configuration rather than source control.
        header = """
        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: max-age=0
Connection: keep-alive
Cookie: mayi_uuid=5402018908261980184268; _ga=GA1.2.485164980.1538988011; bad_id73859f20-f357-11e6-b43e-3b18b16942dc=c7037ee1-cad5-11e8-b72a-5da442cf4db0; cto_lwid=93338721-2b36-40d9-abed-620c82ffebf9; accessId=73859f20-f357-11e6-b43e-3b18b16942dc; _gid=GA1.2.431299195.1538988039; sdtan=1; _channel=tg_baidu; _caname=pinzhuan_dz_bt; semChannelPageSign=72; _my_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22tg_baidu%22%2C%22ca_n%22%3A%22pinzhuan_dz_bt%22%2C%22ca_i%22%3A%22ad%22%7D; _keyword=; _caid=; qimo_seosource_73859f20-f357-11e6-b43e-3b18b16942dc=%E5%85%B6%E4%BB%96%E7%BD%91%E7%AB%99; qimo_seokeywords_73859f20-f357-11e6-b43e-3b18b16942dc=%E6%9C%AA%E7%9F%A5; Qs_lvt_101147=1538988038%2C1539053639; search_5402018908261980184268=""; searchkey=%u5E7F%u5DDE%26/guangzhou/%262%261539046878748%2C%u589E%u57CE%26/guangzhou_zengcheng/%262; viewhistory=*851341025*851215674*851195210*852437821*851525220*852960616*852026287*852897629*852755563*853242354*852025683*851739428*850595365*850393235*851037783*851150236*852486443*850658876*850410697*852438164*852438100; Qs_pv_101147=4404865950751231000%2C1686801514413067800%2C2810878333816636000%2C128491237306991570%2C2532501557664382500; _ip=61.141.65.40; href=http%3A%2F%2Fwww.mayi.com%2Fguangzhou%2Fp1-20%2F1%2F%3Fd1%3D2018-10-10%26d2%3D2018-10-11; pageViewNum=136; SESSION=afc8430a-94cf-4cf2-8c45-08eedd3bc1e5; sid=347173211242345; Hm_lvt_0294bbb72b1c6a6b342da076397c9af2=1538988038,1539046871,1539063151,1539135518; Hm_lpvt_0294bbb72b1c6a6b342da076397c9af2=1539135518; nice_id73859f20-f357-11e6-b43e-3b18b16942dc=36df0421-cc2d-11e8-b72a-5da442cf4db0; _gat_gtag_UA_63543541_1=1; __jsluid=eaced2a9c638ad520b3399c6f8f6a668
Host: www.mayi.com
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36
        """
        # format_headers is a project helper; presumably it turns the raw
        # header text into the mapping requests expects - verify in gongju.
        headers = format_headers(header)
        resp = requests.get(url, headers=headers, timeout=self._HTTP_TIMEOUT)
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if resp.status_code != 200:
            raise RuntimeError('unexpected HTTP status {} for {}'.format(resp.status_code, url))

        soup = BeautifulSoup(resp.text, 'lxml')
        content = soup.find('dl', id='searchRoom')
        if content is None:
            return None

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        for item in content.find_all('dd'):
            item_html = str(item)
            room_id = item.attrs['data']
            record = {
                'house_name': item.find('img', class_='lazy lodgelazy').attrs['alt'],
                'next_url': 'http://www.mayi.com' + item.find('a', target='_blank').attrs['href'],
                'score': self._parse_score(item_html),
                'room_count': self._parse_room_count(item_html),
                'room_id': room_id,
                'create_time': create_time,
                'start_time': self.today,
                'end_time': self.tomorrow,
                'crawl': False,  # detail page not fetched yet
            }
            # update_one replaces Collection.update, which PyMongo 4 removed.
            self.mycol.update_one({"room_id": room_id}, {"$set": record}, upsert=True)
            print(room_id)
        return 1


class MaYiDuanZu_XiangQing_Class():
    """Crawl mayi.com listing detail pages with Selenium and store them in MongoDB.

    Instantiating the class runs the whole job: every document in
    ``Homestay.mayiduanzu_2018_10_10`` that is not yet marked as crawled is
    opened in Chrome, parsed, upserted into ``Homestay.mayi_2018_10_10`` and
    then flagged with ``crawl: 1``. The browser is closed when the
    constructor finishes, so ``self.driver`` is not usable afterwards.
    """

    def __init__(self):
        client = MongoClient(host='127.0.0.1', port=27017)
        mydb = client['Homestay']
        self.mycol = mydb['mayiduanzu_2018_10_10']
        self.col = mydb['mayi_2018_10_10']
        # Raw string: the backslashes in the Windows path are literal.
        self.driver = webdriver.Chrome(
            executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        try:
            # Pending listings are stored with the boolean False; MongoDB does
            # not treat False and 0 as equal, so match both representations.
            # The result is materialised up front because each listing can take
            # a long time and the loop updates the same collection.
            pending = list(self.mycol.find({'crawl': {'$in': [False, 0]}}, {'_id': 0}))
            for record in pending:
                print(record.get('next_url') + '开始')
                self._load_listing(record)
                self.mayiduanzu_xiangqing(record.get('next_url'))
        finally:
            # Always release the browser and chromedriver process.
            self.driver.quit()

    def _load_listing(self, record):
        """Copy the list-page fields of one pending listing onto the instance."""
        self.room_id = record.get('room_id')
        self.room_count = record.get('room_count')
        self.score = record.get('score')
        self.start_time = record.get('start_time')
        self.end_time = record.get('end_time')

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in *text* as an int, or None if there is none."""
        found = re.search(r'(\d+)', text)
        return int(found.group()) if found else None

    @staticmethod
    def _split_address(raw_address):
        """Return ``(formatted_address, district)`` for a raw address string.

        Dashes are removed, then the text is split around the first
        two-character name followed by "区". When no such district exists
        the dash-free text is returned unchanged and the district is None.
        """
        flat = raw_address.replace('-', '')
        found = re.search('(.*?)(..区)(.*)', flat)
        if found is None:
            return flat, None
        city_part, district, remainder = found.groups()
        return city_part + '市-' + district + '-' + remainder, district

    def mayiduanzu_xiangqing(self, url):
        """Load one detail page, wait for it to render, and persist the result.

        The page source is polled up to 100 times, 0.5 s apart. An offline
        listing is only flagged as crawled; a rendered listing is parsed,
        upserted into ``self.col`` and then flagged. If neither state shows
        up in time the listing is left unflagged and an error line is printed.

        Args:
            url: Detail-page URL of the listing identified by ``self.room_id``.
        """
        self.driver.get(url)
        for _ in range(100):
            # One snapshot per poll so every check sees the same markup.
            html = self.driver.page_source
            if '非常抱歉，您访问的房源已下线' in html:
                self.mycol.update_one({"room_id": self.room_id}, {"$set": {'crawl': 1}})
                print(url + '房源已下线')
                break
            if 'table fl fl_p' in html:
                house_item = self._build_house_item(BeautifulSoup(html, 'lxml'))
                # update_one replaces Collection.update, which PyMongo 4 removed.
                self.col.update_one({"room_id": self.room_id}, {"$set": house_item}, upsert=True)
                self.mycol.update_one({"room_id": self.room_id}, {"$set": {'crawl': 1}})
                break
            time.sleep(0.5)
        else:
            # Polling ran out without the page reaching a known state.
            print(url + '出错')

    def _build_house_item(self, soup):
        """Extract the detail document for the current listing from a parsed page."""
        house_name = soup.find('title').text
        detail_items = soup.find('ul', class_='table fl fl_p').find_all('li')

        # Both values start as None so an empty or unmatched list cannot
        # leave them unbound.
        house_type = None
        for entry in detail_items:
            if '房源类型' in entry.text:
                house_type = entry.find('span').text
                break
        bathroom_count = None
        for entry in detail_items:
            if '卫生间' in entry.text:
                # Stop at the first match even when the count is 0.
                bathroom_count = int(entry.find('span').text.strip())
                break

        facility_box = soup.find('div', class_='facility_out_box height_value clearfloat')
        house_facility = ' '.join(li.text.strip() for li in facility_box.find_all('li'))

        capacity = soup.find('li', class_='w258')
        bed_count = self._first_int(capacity.find('p').text)
        live_count = self._first_int(capacity.find('span', class_='fl').text)

        info_box = soup.find('div', class_='room_he room_he_close')
        house_info = None
        if info_box is not None:
            house_info = info_box.text.replace('\t', '').replace('\n', '').replace(' ', '')

        raw_address = soup.find('div', class_='address relave').find(
            class_='absot room_adress').find('span').text
        address, district = self._split_address(raw_address)

        page_text = str(soup)
        city = re.search('city=(.*?);', page_text).group(1) + '市'
        province = re.search('province=(.*?);', page_text).group(1) + '省'

        landlord_name = soup.find('div', class_='landlordDesR').find('font').text
        original_price = float(soup.find('div', id='priceL').find('span').text)
        price_box = soup.find('div', class_='priceR')
        price_etc = price_box.text.strip() if price_box is not None else None

        # Fields set to None below are not extracted from this page.
        return {
            'hid': None,
            'id': self.room_id,
            'insert_time': datetime.datetime.now(),
            'details_data': {
                'house_details': {  # house information
                    'house_name': house_name,
                    'house_type': house_type,
                    'house_facility': house_facility,
                    'house_count': {
                        'bathroom_count': bathroom_count,
                        'room_count': self.room_count,
                        'bed_count': bed_count,
                        'live_count': live_count,
                    },
                    'release_time': None,
                    'house_info': house_info,
                },
                'order_info': {
                    'start_time': self.start_time,
                    'end_time': self.end_time,
                    'order_or_not': True,
                },
                'location': {  # location information
                    'longitude': None,
                    'latitude': None,
                    'address': address,
                    'address_filter': None,
                    'city': city,
                    'province': province,
                    'district': district,
                    'zoning_code': None,
                },
                'landlord': {  # landlord information
                    'landlord_name': landlord_name,
                    'registered_time': None,
                },
                'price': {  # accommodation price
                    'original_price': original_price,
                    'discount_price': None,
                    'other_price': None,
                    'price_etc': price_etc,
                },
                'evaluation': {  # rating
                    'score': self.score,
                    'highest_score': float(5),
                },
            },
        }


# Script entry point: the constructor itself runs the detail-page crawl, so
# this call starts the job as soon as the module is executed or imported.
MaYiDuanZu_XiangQing_Class()
