博主帮一位朋友做论文,需要分析知乎的问答数据。数据量不大,因此直接用 selenium 执行 window.scrollTo 模拟页面下滑,爬取了知乎“抑郁症”专题下的相关问答。

一、Python 开头引入的模块


# Standard library.
import csv
import json
import os.path
import random
import re
import time
import xml

# cookielib is the Python 2 module name; fall back to http.cookiejar and
# expose it under the same name.
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib

# Third-party.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  # Options is taken from the chrome.options module
from selenium.webdriver.common.keys import Keys

# Pillow is optional: when it is missing, Image is simply left undefined.
try:
    from PIL import Image
except ImportError:
    pass

chrome_options = Options() # instantiate the Options object
chrome_options.add_argument('--headless') # ask Chrome to run in headless (silent) mode
# Use Chrome as the browser engine.
# NOTE(review): chrome_options is not passed to Chrome() here (the original
# comment leaves `options = chrome_options` disabled), so as written the
# headless flag above is not applied to this driver.
driver = webdriver.Chrome()



二、知乎 Python 登录验证

# Build the request headers.
# The User-Agent value is taken from the configured list below; one entry is
# picked at random each time the script starts.
ua_list = [
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3510.2 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.8.3.16721",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
]
ua = random.choice(ua_list)
# Request headers for www.zhihu.com; the User-Agent is the randomly chosen `ua`.
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/topic/19564862/hot",
    "User-Agent": str(ua),
}
# Proxy IPs.
# url_ip is the URL of a proxy-IP service that answers in JSON. It is blank
# here and has to be filled in; while it is blank no request is made and
# proxies_list stays empty.
url_ip = ''


def build_proxies(data_json):
    """Turn the proxy service's JSON payload into a list of proxy dicts.

    Args:
        data_json: Decoded JSON object. Entries are read from its 'obj'
            list; each entry must provide 'ip' and 'port'.

    Returns:
        A list of ``{'http': 'ip:port'}`` dicts, one per entry. Empty when
        'obj' is missing or empty.
    """
    entries = data_json.get('obj') or []
    # format() stringifies the port, so a numeric port does not break the join.
    return [
        {'http': '{}:{}'.format(entry['ip'], entry['port'])}
        for entry in entries
    ]


resp = None  # stays None when url_ip is not configured
proxies_list = []
if url_ip:
    # A timeout keeps an unresponsive proxy service from hanging the script.
    resp = requests.get(url=url_ip, timeout=10)
    if resp.status_code == 200:
        proxies_list = build_proxies(resp.json())
# NOTE(review): the source text is truncated at this point. Only the fragment
# `proxies ,` and the final entry of a dict literal survived, which is not
# valid Python. The two statements below are a best-effort reconstruction.
# TODO: confirm against the original script.

# Presumably one proxy was picked at random, the same way `ua` is picked from
# ua_list. None is used when no proxies were fetched.
proxies = random.choice(proxies_list) if proxies_list else None

# The request helpers in this file pass `header` to session.get, so the
# surviving dict tail is assumed to be the end of `header`. Any other entries
# it originally had cannot be recovered from the visible text.
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
}
# Use saved login cookie information.
# NOTE(review): cookie_file is not defined in the visible part of this file;
# it has to be assigned a file path before this point.
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename=cookie_file)

try:
    session.cookies.load(ignore_discard=True)
except (IOError, OSError):
    # A missing, unreadable or malformed cookie file is not fatal: continue
    # with an empty jar. (On Python 3 the jar's LoadError is an OSError.)
    print("Cookie 未能加载")


def get_xsrf():
    """Return the current ``_xsrf`` value, a parameter that changes dynamically.

    Fetches the Zhihu home page through the shared session and reads the
    ``_xsrf`` cookie from the cookie jar attached to the request that was
    sent (the private ``_cookies`` attribute of the prepared request).

    An earlier approach, kept in the original only as commented-out code,
    matched ``name="_xsrf" value="(.*?)"`` in the page with ``re.findall``,
    which yields a list.
    """
    # The home page request is what provides the _xsrf needed for login.
    home_response = session.get('https://www.zhihu.com', headers=header)
    request_cookies = home_response.request._cookies
    return request_cookies.get("_xsrf")

def get_captcha():
    """Fetch the login captcha image and return the text the user types in.

    The image is downloaded through the shared session and written to
    ``captcha.jpg`` in the current working directory. It is then opened with
    Pillow's ``Image`` if possible; otherwise the user is told where the file
    is so it can be opened by hand.

    Returns:
        The captcha string entered by the user.
    """
    # Millisecond timestamp sent as the `r` query parameter.
    timestamp_ms = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + timestamp_ms + "&type=login"
    response = session.get(captcha_url, headers=header)

    # The with-block closes the file, so no explicit close() is needed.
    with open('captcha.jpg', 'wb') as f:
        f.write(response.content)

    # Show the captcha with Pillow. If that fails for any reason (for example
    # Pillow is not installed), point the user at the saved file instead.
    # Exception rather than a bare except, so Ctrl-C still interrupts.
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except Exception:
        print(u'请到 %s 目录找到captcha.jpg 手动输入' % os.path.abspath('captcha.jpg'))

    return input("please input the captcha\n>")

def isLogin():
    """Tell whether the shared session is currently logged in.

    Requests the profile-settings page without following redirects and
    treats an HTTP 200 status as "logged in".

    Returns:
        True when the status code is 200, False otherwise.
    """
    profile_url = "https://www.zhihu.com/settings/profile"
    response = session.get(profile_url, headers=header, allow_redirects=False)
    return response.status_code == 200

def
login(secret, account):
   
# 通过输入的用户名判断是否是手机号

   
if re.match(r"^1\d{10}$", account):

       
print("手机号登录 \n")

       
post_url = 'https://www.zhihu.com/login/phone_num'

       
postdata = {
           
'_xsrf': get_xsrf(),

           
'password': secret,

           
'remember_me': 'true',

           
'phone_num': account,

       }

   
else:

       if
"@" in account:

           
print("邮箱登录 \n")

       
else:

           
print("你的账号输入有问题,请重新登录")

           
return 0

       
post_url = 'https://www.zhihu.com/login/email'

       
postdata
本文版权归趣KUAI排 www.SEOguruBlog.com 所有,如有转发请注明出处,竞价开户托管,seo优化请联系QQ→61910465