소스 검색

初始化

DONGGUOLIANGNEW\edy 2 년 전
커밋
0a1bde34c0
10개의 변경된 파일398개의 추가작업 그리고 0개의 파일을 삭제
  1. 8 0
      .gitignore
  2. 78 0
      auth.py
  3. 1 0
      data - 副本.json
  4. 1 0
      data-.json
  5. 1 0
      data.json
  6. 156 0
      detail.py
  7. 125 0
      geckodriver.log
  8. 11 0
      main.py
  9. 6 0
      readme.md
  10. 11 0
      setup.py

+ 8 - 0
.gitignore

@@ -0,0 +1,8 @@
+.idea/
+__pycache__/
+ali_images/
+ali_images_20191124/
+ali_images_20191126/
+ali_images_20191129/
+ali_images_20191206/
+venv/

+ 78 - 0
auth.py

@@ -0,0 +1,78 @@
+from selenium import webdriver
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from pyquery import PyQuery as pq
+from bs4 import BeautifulSoup
+from time import sleep
+import time
+import traceback
+import json
+import os
+
+# Path to the Chrome executable on Windows. A raw string keeps the backslashes
+# literal instead of relying on invalid escape sequences such as "\P" or "\G".
+chrome_path = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
+
+
+# After a successful login, persist the session cookies for later runs.
+def record_cookie(browser):
+    """Dump the browser's current cookies to ``data.json``.
+
+    Args:
+        browser: Selenium WebDriver whose cookies are saved.
+
+    If reading the cookies or writing the file fails, the browser is quit and
+    ``no_login_id`` is printed; the error is not re-raised.
+    """
+    try:
+        cookies = browser.get_cookies()
+        print(cookies)
+        with open('data.json', 'w') as f:
+            json.dump(cookies, f)
+    except Exception:
+        # Catch Exception rather than using a bare except so that
+        # KeyboardInterrupt and SystemExit still propagate.
+        browser.quit()
+        print("no_login_id")
+
+
+# Restore previously saved cookies into the browser session.
+def add_cookie(browser):
+    """Load the cookies stored in ``data.json`` and add each one to ``browser``.
+
+    An ``expiry`` value whose string form contains a decimal point is
+    truncated to its integer part before the cookie is added.
+    """
+    with open('data.json', 'r') as cookie_file:
+        saved_cookies = json.load(cookie_file)
+
+    for cookie in saved_cookies:
+        if 'expiry' in cookie:
+            whole, dot, _fraction = str(cookie["expiry"]).partition(".")
+            if dot:
+                # Keep only the part before the decimal point.
+                cookie["expiry"] = int(whole)
+        browser.add_cookie(cookie)
+
+
+# Check whether the session is logged in, waiting for a manual login if not.
+def check_login(browser):
+    """Return True once no login marker is found on the page, else False.
+
+    If a login marker is present, wait 30 seconds for a manual login and then
+    poll up to 10 times, 10 seconds apart. As soon as the marker is gone the
+    cookies are saved via ``record_cookie`` and True is returned. False is
+    returned when every poll still finds the marker.
+    """
+    if get_login_flag(browser) is None:
+        # No login marker on the page: the session is already logged in.
+        print('登录成功')
+        return True
+
+    # Give the operator time to log in by hand before polling.
+    sleep(30)
+    for attempt in range(10):
+        print('第%d次检测登录' % attempt)
+        if get_login_flag(browser) is None:
+            record_cookie(browser)
+            return True
+        sleep(10)
+    return False
+
+
+# Locate the marker element that is looked up to detect the login page.
+def get_login_flag(browser):
+    """Return the login-page marker element, or None when none is present.
+
+    The element with id ``check_svg__b`` is tried first, then an element with
+    class ``qr-container``. When neither exists a message is printed and None
+    is returned, which callers treat as "logged in".
+
+    Raises:
+        selenium.common.exceptions.WebDriverException: for driver failures
+            other than "element not found".
+    """
+    # Imported locally so the block brings its own dependency into scope.
+    from selenium.common.exceptions import NoSuchElementException
+
+    locators = ((By.ID, "check_svg__b"), (By.CLASS_NAME, "qr-container"))
+    for by, value in locators:
+        try:
+            return browser.find_element(by, value)
+        except NoSuchElementException:
+            # Only "element not found" means this marker is absent. Other
+            # driver errors (e.g. a dead session) must not be mistaken for a
+            # successful login, so they propagate to the caller.
+            continue
+
+    print("没有找到登录标记,说明已经登录成功")
+    return None

파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 1 - 0
data - 副本.json


파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 1 - 0
data-.json


파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 1 - 0
data.json


+ 156 - 0
detail.py

@@ -0,0 +1,156 @@
+from selenium import webdriver
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+from time import sleep
+import time
+import json
+import auth
+import requests
+import main
+
+
+# ChromeDriver downloads: http://chromedriver.storage.googleapis.com/index.html
+# Place the driver in the Python root directory, e.g. C:\xxx\Python\Python38
+
+def get_detail(url_login, url_detail):
+    """Open Chrome, restore the saved login and scrape the lead list in a loop.
+
+    Args:
+        url_login: URL of the login page; opened first so the saved cookies
+            can be added to the session.
+        url_detail: URL of the lead list page that is scraped.
+
+    The loop runs until the local hour is 0. Each round clicks the
+    recommendation tab, waits 5 minutes, reloads the list page, scrapes every
+    page, saves the cookies and then sleeps for 2 hours. If the login check
+    fails, a WeCom alert is sent and the function returns. Any exception is
+    printed, not re-raised. The browser is always closed on exit.
+    """
+    options = webdriver.ChromeOptions()
+    options.add_experimental_option('excludeSwitches', ['enable-automation'])
+
+    browser = webdriver.Chrome(options=options)
+
+    try:
+        browser.maximize_window()
+        # Let element lookups wait up to 3 seconds before failing.
+        browser.implicitly_wait(3)
+
+        # Open the login page first, then add the saved cookies.
+        browser.get(url=url_login)
+        auth.add_cookie(browser)
+
+        # Go to the lead list page.
+        browser.get(url_detail)
+        sleep(10)
+        if not auth.check_login(browser):
+            print('登录失败')
+            send_wechat_info("抖音线索登录失败")
+            return
+
+        sleep(5)
+        while True:
+            if time.localtime().tm_hour == 0:  # stop once the local hour is 0
+                break
+            # Switch to the recommendation tab to force a refresh.
+            click_sales_clue_push(browser)
+            sleep(5 * 60)  # pause for 5 minutes
+            browser.get(url_detail)
+
+            # Scrape every page of the list.
+            click_page(browser)
+            # Refresh the saved cookies after each round.
+            auth.record_cookie(browser)
+
+            sleep(60 * 60 * 2)  # pause for 2 hours
+    except Exception as e:
+        print('err_detail')
+        print(e)
+    finally:
+        # Runs on every exit path, including the early return after a failed
+        # login, so the browser process is never left behind.
+        browser.quit()
+
+
+# Maps a column index of a lead-table row to the field name used in the
+# record sent to the server.
+key_dic = {
+    1: "name",                 # name
+    2: "phone",                # phone number
+    3: "create_time",          # lead creation time
+    4: "status_interact",      # interaction type
+    5: "status_clue",          # lead type
+    6: "new_interact",         # latest interaction record
+    7: "clue_channel",         # lead channel
+    8: "from_douyin",          # source Douyin account
+    9: "status_distribution",  # assignment status
+    10: "city",                # automatically located city
+    11: "city_hand",           # manually entered region
+    12: "status_call",         # call status
+    13: "last_time",           # most recent interaction time
+    14: "from",                # type of the source Douyin account
+}
+
+
+# Click the lead-recommendation tab.
+def click_sales_clue_push(browser):
+    """Click the element with id ``PSalesCluePush``, then wait 5 seconds.
+
+    This is best effort: if the tab cannot be found or clicked, a message is
+    printed and the function still sleeps and returns normally.
+    """
+    print("点击线索推荐")
+    try:
+        browser.find_element(By.ID, "PSalesCluePush").click()
+    except Exception:
+        # The message now names the tab this function actually clicks.
+        print("没有找到线索推荐tab")
+
+    sleep(5)
+
+
+# Click the lead-management tab.
+def click_sales_clue_list(browser):
+    """Click the element with id ``PSalesClueList``, log it, and wait 5 seconds."""
+    list_tab = browser.find_element(By.ID, "PSalesClueList")
+    list_tab.click()
+    print("点击线索管理")
+    sleep(5)
+
+
+# Scrape the list, paging forward when the pager shows more than one page.
+def click_page(browser):
+    """Read the lead table on every page of the list.
+
+    The pager is inspected through the elements with class
+    ``leads-pager-item``. Three or fewer items is treated as a single page.
+    Otherwise the last pager item is clicked ``num - 4`` times, reading the
+    table after each click.
+    """
+    page_items = browser.find_elements(By.CLASS_NAME, "leads-pager-item")
+    num = len(page_items)
+    if num <= 3:
+        print("只有1页")
+        get_table(browser)
+        return
+
+    # Read the page that is already displayed before paging forward.
+    # Otherwise only the pages reached by clicking would be sent.
+    get_table(browser)
+
+    # NOTE(review): the last pager item is presumably the "next page" control,
+    # and ``num - 3`` presumably reflects the fixed pager controls. Confirm
+    # against the live page.
+    for _ in range(1, num - 3):
+        page_items[num - 1].click()
+        sleep(10)  # wait for the next page to render
+        get_table(browser)
+
+
+# Read the rows of the lead table and forward each one to the server.
+def get_table(browser):
+    """Turn every ``listTableRow`` element into a dict and pass it to ``send_data``.
+
+    Each row is expected to look like::
+
+        <tr>
+          <td>name</td> <td>phone</td>...
+        </tr>
+
+    The text of cell ``i`` is stored under ``key_dic[i]`` for every index in
+    ``key_dic``. Indexes beyond the row's cell count are skipped, and rows
+    that yield no fields are not sent.
+    """
+    rows = browser.find_elements(By.CLASS_NAME, "listTableRow")
+    for row in rows:
+        cells = row.find_elements(By.TAG_NAME, "td")
+        # Iterate over the mapping itself so the last key (14) is included;
+        # range(1, len(key_dic)) stopped one short.
+        data = {
+            field: cells[index].text
+            for index, field in key_dic.items()
+            if index < len(cells)
+        }
+        if not data:
+            continue
+
+        # Push the record to the server.
+        send_data(data)
+
+
+# Send one scraped record to the backend.
+def send_data(data, timeout=30):
+    """POST ``data`` as JSON to ``main.URL_SEND_DATA``.
+
+    Args:
+        data: JSON-serialisable dict describing one lead.
+        timeout: Seconds to wait for the server. Without a timeout a stalled
+            server would block the scraper indefinitely.
+
+    Returns:
+        An empty string, as before.
+
+    Raises:
+        requests.RequestException: on connection errors or timeouts.
+    """
+    print(data)
+    headers = {'Content-Type': 'application/json'}
+    response = requests.post(url=main.URL_SEND_DATA, headers=headers,
+                             data=json.dumps(data), timeout=timeout)
+    print(response)
+    return ''
+
+
+# Send a WeCom (enterprise WeChat) text message.
+def send_wechat_info(message, timeout=30):
+    """POST a text message to the webhook at ``main.URL_SEND_WECHAT_INFO``.
+
+    Args:
+        message: Text placed in the ``content`` field of the payload.
+        timeout: Seconds to wait for the webhook. Without a timeout a stalled
+            connection would block the caller indefinitely.
+
+    Returns:
+        An empty string, as before.
+
+    Raises:
+        requests.RequestException: on connection errors or timeouts.
+    """
+    print(message)
+    payload = {
+        "msgtype": "text",
+        "text": {
+            "content": message,
+        },
+    }
+    headers = {'Content-Type': 'application/json'}
+    response = requests.post(url=main.URL_SEND_WECHAT_INFO, headers=headers,
+                             data=json.dumps(payload), timeout=timeout)
+    print(response)
+    return ''
+
+#
+
+if __name__ == "__main__":
+    # Manual check: print the current local hour, the value get_detail
+    # compares against 0 to decide when to stop.
+    current_hour = time.localtime().tm_hour
+    print(current_hour)

+ 125 - 0
geckodriver.log

@@ -0,0 +1,125 @@
+1574216083475	mozrunner::runner	INFO	Running command: "G:\\Program Files (x86)\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\rust_mozprofile.8ll44itYNb0S"
+1574338834259	mozrunner::runner	INFO	Running command: "G:\\Program Files (x86)\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\rust_mozprofileWHVnrR"
+1574339409092	mozrunner::runner	INFO	Running command: "G:\\Program Files (x86)\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\rust_mozprofilemK1aLr"
+1574339410125	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: mozillaAddons
+1574339410125	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: telemetry
+1574339410125	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: resource://pdf.js/
+1574339410125	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: about:reader*
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+1574339412090	Marionette	INFO	Listening on port 2810
+1574339412307	Marionette	WARN	TLS certificate errors will be ignored for this session
+WebGL(00000000177F3800)::ForceLoseContext
+WebGL(0000000017E24800)::ForceLoseContext
+WebGL(00000000177F4800)::ForceLoseContext
+WebGL(0000000019013000)::ForceLoseContext
+WebGL(0000000017E0F800)::ForceLoseContext
+WebGL(000000001D218800)::ForceLoseContext
+JavaScript error: https://g.alicdn.com/vip/login/0.5.65/js/login/nlogin.js?t=20151220, line 1: TypeError: b.cfg.elUserName.focus is not a function
+WebGL(000000001900B000)::ForceLoseContext
+WebGL(000000001D21F800)::ForceLoseContext
+WebGL(000000001901C000)::ForceLoseContext
+WebGL(0000000009EA7800)::ForceLoseContext
+[Child 10672, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 10672, Chrome_ChildThread] WARNING: pipe error: 109: [Child 9228, C1574341069894	Marionette	INFO	Stopped listening on port 2810
+[GPU 7896, Chrome_ChildThread] WARNING
+###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
+
+:
+###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv
+
+ifest: Invalid extension permission: telemetry
+1574339538476	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: resource://pdf.js/
+1574339538476	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: about:reader*
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+1574339540626	Marionette	INFO	Listening on port 2947
+1574339540648	Marionette	WARN	TLS certificate errors will be ignored for this session
+WebGL(000000001439B800)::ForceLoseContext
+WebGL(000000001AD15800)::ForceLoseContext
+WebGL(00000000178D2800)::ForceLoseContext
+WebGL(00000000178DB000)::ForceLoseContext
+WebGL(000000001AD25800)::ForceLoseContext
+WebGL(000000000D142800)::ForceLoseContext
+[Child 7832, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 7832, Chrome_ChildThrea[Child 4512, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 4512, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 10916, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 10916, Chrome_ChildThread] WARNING: 1574341072617	Marionette	INFO	Stopped listening on port 2947
+[GPU 5900, Chrome_ChildThread] WARNING: pip
+###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv
+
+ion permission: about:reader*
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+1574339675391	Marionette	INFO	Listening on port 3059
+1574339675641	Marionette	WARN	TLS certificate errors will be ignored for this session
+WebGL(0000000014BAE800)::ForceLoseContext
+WebGL(0000000014BBD800)::ForceLoseContext
+WebGL(000000001BF51800)::ForceLoseContext
+WebGL(000000001BF57800)::ForceLoseContext
+WebGL(0000000018CB4800)::ForceLoseContext
+WebGL(000000000CE65800)::ForceLoseContext
+JavaScript error: https://g.alicdn.com/vip/login/0.5.65/js/login/nlogin.js?t=20151220, line 1: TypeError: b.cfg.elUserName.focus is not a function
+[Parent 11600, Gecko_IOThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+WebGL(000000000B96F000)::ForceLoseContext
+WebGL(000000000B97F000)::ForceLoseContext
+WebGL(000000000B979000)::ForceLoseContext
+WebGL(000000000B967000)::ForceLoseContext
+[Child 11940, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 11940, Chrome_ChildT1574341288899	Marionette	INFO	Stopped listening on port 3059
+[GPU 10940, Chrome_ChildThread] WARNING: pipe err
+###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv
+
+1574342463627	mozrunner::runner	INFO	Running command: "G:\\Program Files (x86)\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\rust_mozprofileCTB8Fs"
+1574342464635	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: mozillaAddons
+1574342464636	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: telemetry
+1574342464636	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: resource://pdf.js/
+1574342464636	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: about:reader*
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+1574342466658	Marionette	INFO	Listening on port 5547
+1574342466848	Marionette	WARN	TLS certificate errors will be ignored for this session
+WebGL(0000000013E91800)::ForceLoseContext
+WebGL(0000000019119800)::ForceLoseContext
+WebGL(00000000140DE000)::ForceLoseContext
+WebGL(000000001D20D000)::ForceLoseContext
+WebGL(0000000019114800)::ForceLoseContext
+WebGL(000000001E27E800)::ForceLoseContext
+JavaScript error: https://g.alicdn.com/vip/login/0.5.65/js/login/nlogin.js?t=20151220, line 1: TypeError: b.cfg.elUserName.focus is not a function
+WebGL(00000000140E0800)::ForceLoseContext
+WebGL(000000001E28B800)::ForceLoseContext
+WebGL(000000000B96E800)::ForceLoseContext
+WebGL(0000000019121000)::ForceLoseContext
+[Parent 9572, Gecko_IOThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 5512, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 5512, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 13596, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 13596, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task[Child 14872, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 14872, Chrome_ChildThread] WARNING1574344637002	Marionette	INFO	Stopped listening on port 5547
+[GPU 15316, Chrome_ChildThread] WARNIN
+###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
+
+
+###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv
+
+1574346556740	mozrunner::runner	INFO	Running command: "G:\\Program Files (x86)\\Mozilla Firefox\\firefox.exe" "-marionette" "-foreground" "-no-remote" "-profile" "C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\rust_mozprofilesryWIf"
+1574346557774	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: mozillaAddons
+1574346557774	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: telemetry
+1574346557774	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: resource://pdf.js/
+1574346557775	addons.webextension.screenshots@mozilla.org	WARN	Loading extension 'screenshots@mozilla.org': Reading manifest: Invalid extension permission: about:reader*
+JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
+1574346559721	Marionette	INFO	Listening on port 8694
+1574346559958	Marionette	WARN	TLS certificate errors will be ignored for this session
+WebGL(0000000016FD1800)::ForceLoseContext
+WebGL(0000000016FE1800)::ForceLoseContext
+WebGL(000000001BC28800)::ForceLoseContext
+WebGL(000000001BC3A000)::ForceLoseContext
+WebGL(000000001BCA2000)::ForceLoseContext
+JavaScript error: https://g.alicdn.com/vip/login/0.5.65/js/login/nlogin.js?t=20151220, line 1: TypeError: b.cfg.elUserName.focus is not a function
+WebGL(000000001BCB7800)::ForceLoseContext
+[Child 2844, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 2844, Chrome_ChildThread] WARNING: pipe error: 109: file[Child 16896, Chrome_ChildThread] WARNING: pipe error: 109: file z:/task_1572401533/build/src/ipc/chromium/src/chrome/common/ipc_channel_win.cc, line 341
+[Child 16896, Chrome_ChildThread] WARNING: pipe 1574346587201	Marionette	INFO	Stopped listening on port 8694
+
+###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
+
+[GPU 4436, Chrome_ChildThread] WARNING
+###!!! [Child][MessageChannel::SendAndWait] Error: Channel error: cannot send/recv
+

+ 11 - 0
main.py

@@ -0,0 +1,11 @@
+import detail
+
+# Endpoint that detail.send_data posts each scraped lead to.
+URL_SEND_DATA = "http://10.8.230.200:8099/v1/bpm/save_clue"
+# WeCom webhook that detail.send_wechat_info posts alert messages to.
+# NOTE(review): the webhook key is committed in source. Consider rotating it
+# and loading it from configuration instead.
+URL_SEND_WECHAT_INFO = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=02b34c67-6a00-4cf9-885a-6857ba056aed"
+
+if __name__ == "__main__":
+    # Entry point: log in via the saved cookies, then scrape the lead list.
+    url_detail = 'https://leads.cluerich.com/pc/sales/clue/list'
+    url_login = 'https://leads.cluerich.com/pc/auth/login'
+    detail.get_detail(url_login, url_detail)

+ 6 - 0
readme.md

@@ -0,0 +1,6 @@
+## 准备
+- 将待抓取的商品导入到数据库表 `spider_data`
+
+## 程序入口文件 main.py
+- 用户登录: `auth.add_cookie(browser)` 载入已保存的 cookie,`auth.check_login(browser)` 检测登录状态
+- 开始抓取: `detail.get_detail(url_login, url_detail)`

+ 11 - 0
setup.py

@@ -0,0 +1,11 @@
+from distutils.core import setup
+import py2exe
+from selenium import webdriver
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+import mysql.connector
+import pandas as pd
+
+# Register main.py as the console script to build (py2exe is imported above).
+setup(console=['main.py'])
+