| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
- from selenium import webdriver
- from selenium.webdriver.support.wait import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.common.by import By
- from pyquery import PyQuery as pq
- from bs4 import BeautifulSoup
- from time import sleep
- import config
- import util
- import db_items
# Running total of products processed so far; incremented once per product
# by get_products_item().
products_have_get_num = 0
def get_items(url):
    """Log in at *url*, then scrape every task returned by the task table.

    Opens a Chrome session, loads ``url``, logs in through ``util.login`` and
    iterates the rows returned by ``db_items.get_task_items()``. Each row is
    read positionally: ``task[0]`` task id, ``task[1]`` supplier,
    ``task[2]`` first-level category, ``task[3]`` listing URL. Every result
    page of a listing is passed to ``get_products_item``; afterwards
    ``db_items.update_task_items(task_id)`` is called for that task.

    Args:
        url: Page opened first, before ``util.login`` is invoked.

    Raises:
        Any exception raised by Selenium, ``util`` or ``db_items`` propagates
        to the caller; the browser session is shut down first.
    """
    options = webdriver.ChromeOptions()
    # Exclude the "enable-automation" switch from Chrome's launch flags.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(options=options)
    try:
        browser.maximize_window()
        browser.get(url=url)
        util.login(browser)
        # Fixed pause after login (presumably to let the session settle --
        # TODO confirm whether an explicit wait could replace it).
        sleep(10)

        for task in db_items.get_task_items():
            task_id = task[0]
            supplier = task[1]
            category_first = task[2]
            # Separate name so the function's own ``url`` argument is not
            # shadowed by the per-task listing URL.
            task_url = task[3]

            browser.get(url=task_url)
            sleep(10)
            products_total = get_products_total(browser)
            pages = get_pages(browser)
            sleep(20)
            print("total")
            print(products_total)

            for page in range(1, pages + 1):
                if page > 1:
                    # The first page is already loaded; later pages need a
                    # click on the "next page" link followed by a pause.
                    get_more_page(browser)
                    sleep(20)
                get_products_item(browser, supplier, category_first, task_id)

            if pages == 0:
                # No page counter was found: scrape the page that is showing.
                get_products_item(browser, supplier, category_first, task_id)

            db_items.update_task_items(task_id)
    finally:
        # quit() ends the whole driver session (all windows plus the driver
        # process); running it in ``finally`` releases the browser even when
        # a task fails part-way through.
        browser.quit()
def get_more_page(browser):
    """Scroll the '下一页' ("next page") link into view and click it.

    The link is located by partial link text, scrolled into view with
    ``scrollIntoView(false)``, and clicked once it is reported clickable
    (waiting up to 10 seconds).

    Args:
        browser: Selenium WebDriver showing a result listing.

    Raises:
        Whatever Selenium raises when the link is missing or does not become
        clickable in time. The message "完毕" is printed in either case.
    """
    next_link_locator = (By.PARTIAL_LINK_TEXT, u'下一页')
    try:
        # find_element(By..., ...) is used because the find_element_by_*
        # helpers no longer exist in current Selenium releases.
        next_link = browser.find_element(*next_link_locator)
        browser.execute_script("arguments[0].scrollIntoView(false);", next_link)
        WebDriverWait(browser, 10).until(
            EC.element_to_be_clickable(next_link_locator)
        ).click()
    finally:
        print("完毕")
def get_products_total(browser):
    """Read the product count shown in the page's ``offer-stat`` element.

    Waits up to ``config.timeout`` for an element with class ``offer-stat``
    and reads the text of the ``<em>`` nested in its ``<span>``.

    Args:
        browser: Selenium WebDriver showing a result listing.

    Returns:
        The count as ``int`` when the text parses; the raw text when it was
        read but is not an integer; ``""`` when the element was not found.
        Failures are reported by printing ``get_products_total_timeout``.
    """
    products_total = ""
    try:
        # until() returns the located element, so no second lookup is needed.
        stat = WebDriverWait(browser, config.timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "offer-stat"))
        )
        counter = stat.find_element(By.TAG_NAME, "span").find_element(By.TAG_NAME, "em")
        products_total = counter.text
        products_total = int(products_total)
        print(products_total)
    except Exception:
        # Best-effort lookup: the value is only informational for the caller.
        # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit working.
        print('get_products_total_timeout')
    return products_total
def get_pages(browser):
    """Return the number of result pages shown in the ``page-count`` element.

    Waits up to ``config.timeout`` for an element with class ``page-count``
    and parses its text as an integer.

    Args:
        browser: Selenium WebDriver showing a result listing.

    Returns:
        The page count as ``int``, or ``0`` when the element is missing or
        its text is not an integer (``no_page_count`` is printed then).
    """
    try:
        counter = WebDriverWait(browser, config.timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "page-count"))
        )
        # Convert before binding the result, so a non-numeric label can never
        # leak out as a string and break the caller's ``range(1, pages + 1)``.
        pages = int(counter.text)
    except Exception:
        # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit working.
        print("no_page_count")
        return 0
    print(pages)
    return pages
def get_products_item(browser, supplier, category_first, task_id):
    """Store every product listed on the current page.

    Looks inside the element with id ``search-bar`` for ``title-link``
    elements (title text and ``href``) and ``image`` elements (thumbnail
    ``src``). Each product is printed and passed to
    ``db_items.insert_spider_data``; the module-level counter
    ``products_have_get_num`` is incremented once per product.

    Args:
        browser: Selenium WebDriver showing a result listing.
        supplier: Stored under the ``supplier`` key of each record.
        category_first: Stored under the ``category_first`` key.
        task_id: Stored under the ``spider_item_id`` key.
    """
    global products_have_get_num

    parent = browser.find_element(By.ID, "search-bar")
    items = parent.find_elements(By.CLASS_NAME, "title-link")
    images = parent.find_elements(By.CLASS_NAME, "image")

    for index, item in enumerate(items):
        products_have_get_num += 1
        print('*' * 50)

        title = item.text
        url = item.get_attribute("href")

        # The thumbnail URL is only printed, never stored, so a listing with
        # fewer thumbnails than titles must not abort the remaining inserts.
        image_url = None
        if index < len(images):
            image = images[index].find_element(By.TAG_NAME, "img")
            image_url = image.get_attribute("src")

        print(title)
        print(url)
        print(image_url)

        goods = {
            'supplier': supplier,
            'category_first': category_first,
            'url': url,
            'title': title,
            'spider_item_id': task_id,
        }
        db_items.insert_spider_data(goods)

    print(' (●ˇ∀ˇ●) ' * 5)
    print('一共%d条数据' % products_have_get_num)
|