# Source repository: dongguoliang/spider_baidutongji

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
from time import sleep
import config
import util
import db_items

# Running count of products handled so far; incremented once per product in
# get_products_item and never reset, so it spans all pages and tasks of a run.
products_have_get_num = 0


def get_items(url):
    """Open a Chrome session, log in, and scrape every task's listing pages.

    The browser is first pointed at ``url`` and handed to ``util.login``.
    Each row returned by ``db_items.get_task_items()`` is then processed:
    the row's URL (index 3) is loaded, its result pages are walked with
    ``get_more_page``, every page is stored through ``get_products_item``,
    and finally the task is marked via ``db_items.update_task_items``.

    Args:
        url: Page to open before ``util.login`` is called.

    Raises:
        Any exception raised by Selenium or the helper modules propagates to
        the caller; the browser session is shut down first.
    """
    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(options=options)
    try:
        browser.maximize_window()
        browser.get(url=url)

        util.login(browser)
        sleep(10)
        for task in db_items.get_task_items():
            task_id = task[0]
            supplier = task[1]
            category_first = task[2]
            # Separate name so the ``url`` argument is not shadowed.
            task_url = task[3]

            browser.get(url=task_url)
            sleep(10)
            products_total = get_products_total(browser)
            pages = get_pages(browser)
            sleep(20)
            print("total")
            print(products_total)

            # get_pages returns 0 when no page counter is found; such a
            # listing is still scraped once, as a single page.
            for page in range(1, max(pages, 1) + 1):
                if page > 1:
                    get_more_page(browser)
                    sleep(20)
                get_products_item(browser, supplier, category_first, task_id)

            db_items.update_task_items(task_id)
    finally:
        # quit() ends the whole driver session (all windows plus the driver
        # process), and runs even when scraping fails part-way through.
        browser.quit()


def get_more_page(browser):
    """Scroll the "next page" link into view and click it.

    Args:
        browser: Selenium WebDriver positioned on a listing page.

    Raises:
        Selenium exceptions propagate unchanged: the link lookup fails
        immediately if the link is absent, and the clickable wait times out
        after 10 seconds. The completion message is printed either way.
    """
    next_link = (By.PARTIAL_LINK_TEXT, u'下一页')
    try:
        # find_element(By, value) is available in both Selenium 3 and 4,
        # unlike the removed find_element_by_* helpers.
        link = browser.find_element(*next_link)
        browser.execute_script("arguments[0].scrollIntoView(false);", link)
        WebDriverWait(browser, 10).until(EC.element_to_be_clickable(next_link)).click()
    finally:
        print("完毕")


def get_products_total(browser):
    """Read the product count shown inside the page's ``offer-stat`` element.

    Args:
        browser: Selenium WebDriver positioned on a listing page.

    Returns:
        The count as an ``int``. An empty string is returned when the element
        does not appear within ``config.timeout``, the nested ``span``/``em``
        is missing, or its text is not an integer.
    """
    products_total = ""
    try:
        # until() returns the located element, so it is reused directly.
        stat = WebDriverWait(browser, config.timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "offer-stat")))
        text = stat.find_element(By.TAG_NAME, "span").find_element(By.TAG_NAME, "em").text
        # Assign only after a successful parse so a non-numeric text never
        # leaks out as the return value.
        products_total = int(text)
        print(products_total)
    except Exception:
        # Best-effort lookup; Exception (not a bare except) keeps Ctrl-C and
        # SystemExit working during the wait.
        print('get_products_total_timeout')

    return products_total


def get_pages(browser):
    """Read the number of result pages from the ``page-count`` element.

    Args:
        browser: Selenium WebDriver positioned on a listing page.

    Returns:
        The page count as an ``int``, or ``0`` when the element does not
        appear within ``config.timeout`` or its text is not an integer.
    """
    pages = 0
    try:
        # until() returns the located element, so it is reused directly.
        counter = WebDriverWait(browser, config.timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "page-count")))
        # Parse before assigning: callers do arithmetic on the result, so it
        # must never be left as the raw text when int() fails.
        pages = int(counter.text)
        print(pages)
    except Exception:
        # Best-effort lookup; Exception (not a bare except) keeps Ctrl-C and
        # SystemExit working during the wait.
        print("no_page_count")
    return pages


def get_products_item(browser, supplier, category_first, task_id):
    """Store every product listed inside the page's ``search-bar`` element.

    For each ``title-link`` element found, the title text and ``href`` are
    printed and inserted through ``db_items.insert_spider_data`` together with
    the task metadata. The image URL is printed only; it is not stored.

    Args:
        browser: Selenium WebDriver positioned on a listing page.
        supplier: Value stored under the ``supplier`` key of each record.
        category_first: Value stored under the ``category_first`` key.
        task_id: Value stored under the ``spider_item_id`` key.

    Raises:
        Selenium exceptions propagate, e.g. when ``search-bar`` is missing.
    """
    global products_have_get_num
    parent = browser.find_element(By.ID, "search-bar")
    items = parent.find_elements(By.CLASS_NAME, "title-link")
    images = parent.find_elements(By.CLASS_NAME, "image")
    for index, item in enumerate(items):
        products_have_get_num += 1
        print('*' * 50)
        title = item.text
        url = item.get_attribute("href")
        # Images are paired with titles by position. The image URL is only
        # printed, so a missing image node must not abort the whole page.
        image_url = None
        if index < len(images):
            image_url = images[index].find_element(By.TAG_NAME, "img").get_attribute("src")
        print(title)
        print(url)
        print(image_url)
        goods = {
            'supplier': supplier,
            'category_first': category_first,
            'url': url,
            'title': title,
            'spider_item_id': task_id
        }
        db_items.insert_spider_data(goods)

    print('	(●ˇ∀ˇ●)	' * 5)
    print('一共%d条数据' % products_have_get_num)