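# Walk the new ES cluster's indices from newest to oldest and look every
# document up again; when the same document exists more than once, delete the
# copy with the larger spider value (i.e. the newer one) and keep the older one.
# Any document that has not been added to redis yet is pushed into redis.
# This runs forever in a single thread, continuously cleaning the ES tables.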
import datetime
import time
import traceback

from base.utils.es_helper import es_query, es_delete
from base.utils.redis_helper import handle_redis


def _delete_doc(doc):
    """Ask ES to delete the hit ``doc`` from its own index, matched on ``_id``."""
    query_delete = {"query": {"bool": {"must": [{"term": {"_id": doc["_id"]}}]}}}
    es_delete(doc["_index"], query_delete)


def _remove_duplicates(copies):
    """Reduce ``copies`` (ES hits sharing one ``_id``) to a single survivor.

    Copies whose ``_source`` has no ``spider_time`` are deleted outright.
    Among the rest, the copy whose ``_index`` name compares lowest is kept and
    every other copy is deleted.
    NOTE(review): presumably index names sort chronologically, so "lowest"
    means "oldest" - confirm against the index naming scheme.
    """
    win_item = None
    for copy in copies:
        if copy["_source"].get("spider_time") is None:
            # No spider_time at all: drop this copy immediately.
            _delete_doc(copy)
            continue
        if win_item is None:
            win_item = copy
            continue
        if copy["_index"] > win_item["_index"]:
            loser = copy
        else:
            # The current copy becomes the survivor. Without this promotion the
            # following copies would be compared against a deleted document.
            loser, win_item = win_item, copy
        _delete_doc(loser)
        print(f"删除 index:{loser['_index']} url:{loser['_id']}")


def doESCheak(oldest_time=1603123200, window=60):
    """Endlessly scan ES backwards in time and delete duplicated documents.

    Starting at the current time, the ``aic_ik*`` indices are queried for
    documents whose ``spider_time`` lies in ``[now_time - window, now_time)``.
    Each hit's ``_id`` is passed to ``handle_redis.cache_set("news_set_es", ...)``
    and then looked up again across ``aic_ik*``; if more than one copy exists,
    the extras are deleted (see ``_remove_duplicates``). The window then moves
    ``window`` seconds into the past. Once the window start drops below
    ``oldest_time`` the scan restarts from the current time.

    Args:
        oldest_time: Unix timestamp at which a round ends. The default
            1603123200 is 2020-10-19 16:00:00 UTC.
        window: Width of each scanned time slice, in seconds.

    This function never returns. Errors raised while processing a window are
    printed and the same window is retried.
    """
    now_time = int(time.time())
    while True:
        print(f"[es_cheak]{now_time - window} -> {now_time} 当前时间为:{datetime.datetime.fromtimestamp(now_time)}")
        # Compare the scan cursor (not the wall clock) with the floor;
        # otherwise the round never ends and the cursor walks back forever.
        if oldest_time > now_time:
            print("[es_cheak]完成一轮检查,马上开始下一轮检查")
            now_time = int(time.time())
            continue
        query = {
            "query": {"bool": {"must": [
                {
                    "range": {
                        "spider_time": {
                            # Half-open window: a document sitting exactly on
                            # a boundary belongs to exactly one window.
                            "gte": now_time - window,
                            "lt": now_time,
                        }
                    }
                }
            ]}},
            "from": 0,
            # NOTE(review): a window holding more than 10000 documents is only
            # partially scanned.
            "size": 10000,
        }
        try:
            hits = es_query(database="aic_ik*", query=query, table=None)['hits']['hits']
            for hit in hits:
                url = hit['_id']
                # Register the id in redis for de-duplication.
                handle_redis.cache_set("news_set_es", url)
                # Look the same id up across all indices to detect duplicates.
                query2 = {"query": {"bool": {"must": [{"term": {"_id": url}}]}}, "from": 0, "size": 10000}
                copies = es_query(database="aic_ik*", query=query2, table=None)['hits']['hits']
                if len(copies) > 1:
                    _remove_duplicates(copies)

            print(f"[es_cheak]任务完成,共计{len(hits)}条")
            now_time -= window
        except Exception:
            # Best effort: report the failure and retry the same window.
            # KeyboardInterrupt/SystemExit are not caught, so the loop can
            # still be stopped.
            traceback.print_exc()


# Script entry point: start the endless ES clean-up loop when run directly.
if __name__ == "__main__":
    doESCheak()
