items.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. from selenium import webdriver
  2. from selenium.webdriver.support.wait import WebDriverWait
  3. from selenium.webdriver.support import expected_conditions as EC
  4. from selenium.webdriver.common.by import By
  5. from pyquery import PyQuery as pq
  6. from bs4 import BeautifulSoup
  7. from time import sleep
  8. import config
  9. import util
  10. import db_items
# Running total of products scraped so far across all tasks
# (incremented per product by get_products_item; printed after each insert).
products_have_get_num = 0
  13. def get_items(url):
  14. options = webdriver.ChromeOptions()
  15. options.add_experimental_option('excludeSwitches', ['enable-automation'])
  16. browser = webdriver.Chrome(options=options)
  17. wait = WebDriverWait(browser, 15)
  18. browser.maximize_window()
  19. browser.get(url=url)
  20. util.login(browser)
  21. sleep(10)
  22. tasks = db_items.get_task_items()
  23. for task in tasks:
  24. task_id = task[0]
  25. supplier = task[1]
  26. category_first = task[2]
  27. url = task[3]
  28. browser.get(url=url)
  29. sleep(10)
  30. products_total = get_products_total(browser)
  31. pages = get_pages(browser)
  32. sleep(20)
  33. print("total")
  34. print(products_total)
  35. for page in range(1, pages + 1):
  36. if page > 1:
  37. get_more_page(browser)
  38. sleep(20)
  39. get_products_item(browser, supplier, category_first, task_id)
  40. if pages == 0:
  41. get_products_item(browser, supplier, category_first, task_id)
  42. db_items.update_task_items(task_id)
  43. browser.close()
  44. def get_more_page(browser):
  45. try:
  46. # browser.find_element_by_class_name("next").click()
  47. page = browser.find_element_by_partial_link_text(u'下一页')
  48. browser.execute_script("arguments[0].scrollIntoView(false);", page)
  49. WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, u'下一页'))).click()
  50. finally:
  51. print("完毕")
  52. def get_products_total(browser):
  53. products_total = ""
  54. try:
  55. WebDriverWait(browser, config.timeout).until(EC.presence_of_element_located((By.CLASS_NAME, "offer-stat")))
  56. products_total = browser.find_element_by_class_name("offer-stat").find_element_by_tag_name("span").find_element_by_tag_name("em").text
  57. products_total = int(products_total)
  58. print(products_total)
  59. except:
  60. print('get_products_total_timeout')
  61. return products_total
  62. def get_pages(browser):
  63. pages = 0
  64. try:
  65. WebDriverWait(browser, config.timeout).until(EC.presence_of_element_located((By.CLASS_NAME, "page-count")))
  66. pages = browser.find_element_by_class_name("page-count").text
  67. pages = int(pages)
  68. print(pages)
  69. except:
  70. print("no_page_count")
  71. return pages
  72. def get_products_item(browser, supplier, category_first, task_id):
  73. global products_have_get_num
  74. # WebDriverWait(browser, config.timeout).until(EC.presence_of_element_located((By.CLASS_NAME, "search-bar")))
  75. parent = browser.find_element_by_id("search-bar")
  76. items = parent.find_elements_by_class_name("title-link")
  77. prices = parent.find_elements_by_class_name("price-container")
  78. images = parent.find_elements_by_class_name("image")
  79. for index in range(len(items)):
  80. products_have_get_num += 1
  81. print('*' * 50)
  82. item = items[index]
  83. # price = prices[index].text
  84. image = images[index].find_element_by_tag_name("img")
  85. title = item.text
  86. url = item.get_attribute("href")
  87. image_url = image.get_attribute("src")
  88. print(title)
  89. # print(price)
  90. print(url)
  91. print(image_url)
  92. goods = {
  93. 'supplier': supplier,
  94. 'category_first': category_first,
  95. 'url': url,
  96. 'title': title,
  97. 'spider_item_id': task_id
  98. }
  99. db_items.insert_spider_data(goods)
  100. print(' (●ˇ∀ˇ●) ' * 5)
  101. print('一共%d条数据' % products_have_get_num)