crawle.py

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import mongo  # local helper module; save_to_mongo() below is currently commented out

# one shared Chrome instance and an explicit 15-second wait
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 15)
def crawle(key, page):
    """Search 1688.com for `key`, sort by deal volume, and scrape `page` result pages."""
    url = 'https://www.1688.com/'
    browser.get(url=url)
    # dismiss the identity/login prompt
    button = browser.find_element_by_class_name('identity-cancel')
    button.click()
    # enter the keyword and submit the search
    input = browser.find_element_by_id('alisearch-keywords')
    input.send_keys(key)
    sea_button = browser.find_element_by_id('alisearch-submit')
    sea_button.click()
    # close the overlay ad, then sort the results by deal volume
    button_1 = browser.find_element_by_class_name('s-overlay-close-l')
    button_1.click()
    button_deal = browser.find_elements_by_css_selector('.sm-widget-sort.fd-clr.s-widget-sortfilt li')[1]
    button_deal.click()
    try:
        # scroll to the bottom twice so the lazy-loaded offers render, then wait for the last one
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#offer60')))
    except Exception:
        print('*' * 30, 'page load timed out', '*' * 30, '\n\n\n')
    # scrape the first page of results
    for item in get_products():
        print(item)
        # save_to_mongo(item, key)
    if page > 1:
        for page in range(2, page + 1):
            get_more_page(key, page)
def get_more_page(key, page):
    """Jump to `page` via the pagination input and scrape that page's results."""
    page_input = browser.find_element_by_class_name('fui-paging-input')
    page_input.clear()
    page_input.send_keys(page)
    button = browser.find_element_by_class_name('fui-paging-btn')
    button.click()
    try:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#offer60')))
    except Exception:
        print('*' * 30, 'page load timed out', '*' * 30, '\n\n\n')
    for item in get_products():
        print(item)
        # save_to_mongo(item, key)
def get_products():
    """Parse the current result page and yield one dict per offer."""
    html = browser.page_source
    doc = pq(html)
    items = doc('.sm-offer .fd-clr .sm-offer-item').items()
    index = 0
    for item in items:
        index += 1
        print('*' * 50)
        title = item.find('.s-widget-offershopwindowtitle').text().split('\n')
        title = ' '.join(title)
        # the price block holds the price in its first lines and the deal count after them
        price_a = item.find('.s-widget-offershopwindowprice').text().split('\n')
        price = ''.join(price_a[:2])
        deal = ''.join(price_a[2:])
        # product URL: re-parse the title node with BeautifulSoup to read the link's href
        text = item.find('.s-widget-offershopwindowtitle')
        soup = BeautifulSoup(str(text), 'lxml')
        a = soup.select('.s-widget-offershopwindowtitle a')[0]
        url = a['href']
        print(title)
        print(price)
        print(deal)
        print(url)
        print(' (●ˇ∀ˇ●) ' * 5)
        # yield the fields so callers can print or persist them (e.g. save_to_mongo)
        yield {'title': title, 'price': price, 'deal': deal, 'url': url}
    print('%d items in total' % index)