博主帮一个朋友做论文,要分析知乎的问答数据 ,数据量不多 ,因此简单用selenium中关于鼠标下滑window.scrollTo方法爬取了知乎的‘’抑郁症“专题相关问答
一 、python开头引入的模块
import requests,json,random
try:
import cookielib
except:
import http.cookiejar as cookielib
import os.path
try:
from PIL import Image
except:
pass
import time,csv,xml,re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options # 从options模块中调用Options类
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
chrome_options = Options() # 实例化Option对象
chrome_options.add_argument('--headless') # 把Chrome浏览器设置为静默模式
driver = webdriver.Chrome() # 设置引擎为Chrome,在后台默默运行#options = chrome_options
二 、知乎python登陆验证
# # 构造 Request headers
# 从配置表获取
ua_list = [
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3510.2 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.8.3.16721",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:45.0) Gecko/20100101 Firefox/45.0',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101 Firefox/52.0',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
]
ua = random.choice(ua_list)
headers = {
"Host": "www.zhihu.com",
"Referer": "https://www.zhihu.com/topic/19564862/hot",
'User-Agent': str(ua)
}
#代理ip
代理ip json格式的url
url_ip=''
resp = requests.get(url=url_ip)
proxies_list=[]
if resp.status_code == 200:
data_json = resp.json()
for d in data_json['obj']:
port = d['port']
ip = d['ip']
full_ip = ip + ':' + port
dict ={'http':full_ip}
proxies_list.append(dict)
proxies ,
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36',
}
# 使用登录cookie信息
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename=cookie_file)
try:
session.cookies.load(ignore_discard=True)
except:
print("Cookie 未能加载")
def get_xsrf():
'''_xsrf 是一个动态变化的参数'''
index_url = 'https://www.zhihu.com'
# 获取登录时需要用到的_xsrf
index_page = session.get(index_url, headers=header)
# html = index_page.cookies
# pattern = r'name="_xsrf" value="(.*?)"'
#
# # 这里的_xsrf 返回的是一个list
#
# _xsrf = re.findall(pattern, html)
xsrf = index_page.request._cookies.get("_xsrf")
return xsrf
# 获取验证码
def get_captcha():
t = str(int(time.time() * 1000))
captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
r = session.get(captcha_url, headers=header)
with open('captcha.jpg', 'wb') as f:
f.write(r.content)
f.close()
# 用pillow 的 Image 显示验证码
# 如果没有安装 pillow 到源代码所在的目录去找到验证码然后手动输入
try:
im = Image.open('captcha.jpg')
im.show()
im.close()
except:
print(u'请到 %s 目录找到captcha.jpg 手动输入' % os.path.abspath('captcha.jpg'))
captcha = input("please input the captcha\n>")
return captcha
def isLogin():
# 通过查看用户个人信息来判断是否已经登录
url = "https://www.zhihu.com/settings/profile"
login_code = session.get(url, headers=header, allow_redirects=False).status_code
if login_code == 200:
return True
else:
return False
def login(secret, account):
# 通过输入的用户名判断是否是手机号
if re.match(r"^1\d{10}$", account):
print("手机号登录 \n")
post_url = 'https://www.zhihu.com/login/phone_num'
postdata = {
'_xsrf': get_xsrf(),
'password': secret,
'remember_me': 'true',
'phone_num': account,
}
else:
if "@" in account:
print("邮箱登录 \n")
else:
print("你的账号输入有问题,请重新登录")
return 0
post_url = 'https://www.zhihu.com/login/email'
postdata