I recently went over some web-scraping knowledge points and took notes. The full code is in my repository. This article is for learning purposes only; if anything violates the rules, please contact me and I will take it down.
The flowchart was generated with the online AI tool infography, which can build a flowchart automatically from a URL or a piece of text. It works quite well.

Web-scraping basics

1. Making requests with requests
  1.1. Requesting data with requests
  1.2. Parsing and extracting data
    1.2.1. Parsing data: 1.2.1.1. BeautifulSoup / 1.2.1.2. lxml
    1.2.2. Extracting data: 1.2.2.1. re / 1.2.2.2. xpath / 1.2.2.3. CSS selectors
  1.3. Data storage: 1.3.1. MongoDB / 1.3.2. Redis / 1.3.3. CSV / 1.3.4. MySQL
  1.4. Obtaining and carrying cookies with requests: 1.4.1. Carrying / 1.4.2. Obtaining, then carrying
  1.5. Maintaining a session with session
  1.6. Multi-process and multi-threaded crawling
  1.7. Asynchronous crawling with aiohttp
2. The scrapy framework
  2.1. default: 2.1.1. Configuring the project / 2.1.2. Defining the data model / 2.1.3. Crawling data / 2.1.4. Saving items to the database / 2.1.5. Request and response middleware / 2.1.6. Obtaining and carrying cookies
  2.2. crawl
  2.3. Distributed crawling with Redis: 2.3.1. Redis / 2.3.2. Crawling into Redis with scrapy_redis / 2.3.3. Redis to MongoDB
3. Browser automation with selenium
  3.1. OCR for alphanumeric captchas
  3.2. OpenCV template matching for slider captchas
4. JS reverse engineering
  4.1. Obfuscated/encrypted responses
    4.1.1. Decrypting in JS: AES (ECB/CBC), DES, RSA, MD5, SHA-256, Base64
    4.1.2. Decrypting in Python: AES (ECB/CBC), DES, Base64
  4.2. Hook injection to defeat anti-debugging

1. Making requests with requests
The requests library sends HTTP requests and interacts with web servers, for example to fetch data or submit forms.
import requests          # HTTP requests; pip install requests
import re                # regex parsing
from lxml import etree   # XPath parsing
import pymongo           # MongoDB storage; pip install pymongo

1.1. Requesting data with requests
Attribute                   Description
response.text               response body as a str, for text data such as HTML or plain text
response.content            response body as bytes, for binary data such as images or files
response.status_code        HTTP status code of the response
response.request.headers    headers of the request that was sent
response.headers            response headers
response.request._cookies   cookies carried by the request that was sent
response.cookies            cookies set by the response (after Set-Cookie)
response.json()             parse the JSON response body into a dict
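As a quick illustration of these attributes, a minimal sketch (httpbin.org is only a stand-in JSON endpoint, not part of the original notes):

import requests

resp = requests.get('https://httpbin.org/get')
print(resp.status_code)              # e.g. 200
print(resp.headers['Content-Type'])  # a response header
print(resp.json())                   # JSON body parsed into a dict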
url = 'https://www.cheshi.com/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}
response = requests.get(url, headers=headers)
response.encoding = 'gb2312'   # the page came back garbled; its charset is gb2312, so re-decode it
print(response.text, response.status_code, response.headers, response.encoding, response.request.headers)
with open('./网上车市.html', 'w', encoding='utf-8') as f:
    f.write(response.text)     # save the page source

1.2. Parsing and extracting data
1.2.1. Parsing data

1.2.1.1. BeautifulSoup
# parse the page with bs4 (select / find_all selectors)
from bs4 import BeautifulSoup
soup = BeautifulSoup(html.text, 'lxml')
titles = soup.select('.article-summary .article-title')
for title in titles:
    print(title.select('a')[0].text)

1.2.1.2. lxml
from lxml import etree
tree = etree.HTML(html.text)
dls = tree.xpath('//dl[contains(@class, "list") and contains(@class, "hiddenMap") and contains(@class, "rel")]')

1.2.2. Extracting data
1.2.2.1. re

Symbol          Meaning
.               matches any character except \n
re.S            makes . match everything, including \n
\w              digit, letter or underscore; \W is the opposite
\d              digit; \D non-digit
\s              whitespace (space, newline); \S non-whitespace
[]              any one of the characters inside; [^] any character not inside
{num}           repeat num times
*               repeat 0 or more times
+               repeat 1 or more times
.*?             lazy match: stop at the first match, then continue with the rest
re.compile()    pre-compile the pattern
()              grouping, to capture data
exp = re.compile('class="m_detail".*?href.*?>(.*?)<', re.S)   # capture the text inside the <a> tags; re.S makes . also match newlines
print(exp.findall(response.text))   # find every match of the pattern in response.text; returns a list

1.2.2.2. xpath
Symbol                                                 Meaning
./                                                     match relative to the current node
/                                                      match direct children
//                                                     match descendants at any depth
//div[@class="A"]                                      div elements whose class is A
//div[contains(@class,'A') and contains(@class,'B')]   div elements whose class contains both A and B
tree = etree.HTML(response.text)
title = tree.xpath('//div[@class="m_detail"]//a/text()')
print(title)

1.2.2.3. CSS selectors
# example with selenium's CSS selector support
submit_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.next-pagination-jump-go')))

1.3. Data storage
1.3.1. MongoDB

client = pymongo.MongoClient('mongodb://localhost:27017')   # connect to the client
db = client['app']                                          # select the database
col = db['C1']                                              # select the collection
obj = col.insert_one(user)                                   # insert one document
print(obj.inserted_id)
objs = col.insert_many(users)                                 # insert many documents
print(objs.inserted_ids)                                      # ids of all inserted documents
users = col.find()                                            # query everything
objs = col.find({'name': 'zoe'})                              # query by condition
objs = col.find({'age': {'$gt': 20}})                         # age greater than 20; $regex does regex matching

1.3.2. Redis
import redis   # pip install redis
db = redis.Redis(host='localhost', port=6379, decode_responses=True)
# string: store a value
db.set('name', '小12')                                # set one key/value pair
print(db.get('name'))                                 # get one value
db.mset({'name1': '老王', 'age1': '老衲年方28'})        # set several key/value pairs
print(db.mget('name1', 'name', 'age1'))               # get several values
# hash: store field -> value mappings
db.hset('hash1', 'key1', 'value1')                    # add fields
db.hset('hash1', 'key2', 'value2')
db.hset('hash1', 'key3', 'value3')
print(db.hget('hash1', 'key2'))                       # value of one field in the hash
print(db.hgetall('hash1'))                            # all field/value pairs in the hash
# list
db.lpush('list1', 1, 2, 3)                            # push from the left (LIFO)
db.rpush('list2', 2, 3, 4, 5)                         # push from the right (FIFO)
print(db.llen('list1'))                               # length of the list
print(db.lrange('list1', 0, -1))                      # lrange key start stop (-1 means the last element in Redis)
# set
db.sadd('set1', 55, 66, 77, 55)
print(db.scard('set1'))                               # scard: number of members
print(db.smembers('set1'))                            # smembers: all members
# zset
db.zadd('zset1', {'zoe': 22, 'jodie': 11})            # Redis sorted sets require a score for every member
print(db.zcard('zset1'))
print(db.zrange('zset1', 0, -1, withscores=True))     # [('jodie', 11.0), ('zoe', 22.0)]

1.3.3. CSV
import csv   # read/write CSV data
headers = ['z', 'f', 'g']   # write
rows = [('aa', 'bb', 'cc'), ('dd', 'rr', 'ww'), ('ff', 'yy', 'jj')]
with open('save.csv', 'a', newline='') as f:   # 'w' overwrites, 'a' appends; newline='' avoids blank lines between rows
    f_csv = csv.writer(f)                      # writer object
    f_csv.writerow(headers)                    # write one row
    f_csv.writerows(rows)                      # write many rows
with open('save.csv', 'r', encoding='utf-8') as f:   # read
    f_csv = csv.reader(f)                      # Python's built-in CSV parser
    next(f_csv)                                # skip the header row
    for i in f_csv:
        print(i)
import xlrd, xlwt            # read/write Excel; pip install xlrd xlwt
from docx import Document    # read/write Word; pip install python-docx
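A minimal sketch of those two libraries in use (the file names and cell values below are made up for illustration):

import xlwt
from docx import Document

wb = xlwt.Workbook()
ws = wb.add_sheet('sheet1')
ws.write(0, 0, 'title')     # row, column, value
ws.write(0, 1, 'price')
wb.save('save.xls')         # xlwt writes the legacy .xls format

doc = Document()
doc.add_paragraph('scraped text goes here')
doc.save('save.docx')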
1.3.4. MySQL

from sqlalchemy import create_engine
from sqlalchemy import Column, String, Integer, Text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# create the base class
Base = declarative_base()
# create the engine
engine = create_engine('mysql+pymysql://root:PASSWORD@127.0.0.1:3306/test?charset=utf8', echo=True)

# define the Book class
class Book(Base):
    __tablename__ = 'book'
    id = Column('id', Integer, primary_key=True, autoincrement=True)
    title = Column('title', String(20))
    info = Column('info', String(30))
    star = Column('star', String(10))
    pl = Column('pl', String(10))
    introduce = Column('introduce', Text())

# create the table structure
Base.metadata.create_all(engine)
# create a session
Session = sessionmaker(bind=engine)
sess = Session()
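The notes stop after creating the session; a minimal sketch of using it to insert a row (the field values are placeholders) could look like this:

book = Book(title='demo', info='author / publisher', star='9.0', pl='1000', introduce='...')
sess.add(book)
sess.commit()   # write the row to MySQL
sess.close()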
1.4. Obtaining and carrying cookies with requests

1.4.1. Carrying cookies
url = 'https://my.cheshi.com/user/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}
cookiespv_uid1732254874878; cheshi_UUID01JD96ZEWPDK2ZKK8X3WGFFPVS; cheshi_pro_cityMV%2FljJfkuqxfMV%2FkuJzln47ljLpfYmVpamluZw%3D%3D; Hm_lvt_8fe47348e12ba11be217fd389b1154721732254888,1732494411; HMACCOUNT0F938E3E8702278B; lv1732674538; vn7; Hm_lvt_ed9cf33799965fb6c868762ac84e663e1732674587; Hm_lpvt_ed9cf33799965fb6c868762ac84e663e1732674590; cheshi_tokeneyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiIsImp0aSI6ImNoZXNoaV9oNV9zaWduIn0.eyJpc3MiOiJodHRwczpcL1wvYXBpLmNoZXNoaS5jb20iLCJhdWQiOiJodHRwczpcL1wvYXBpLmNoZXNoaS5jb20iLCJqdGkiOiJjaGVzaGlfaDVfc2lnbiIsImlhdCI6MTczMjY3NDY2OSwibmJmIjoxNzMyNjc0NzI5LCJleHAiOjE3MzMyNzk0NjksInVpZCI6IjkxMDcxNDIifQ.ihAUr-0-7HEFedu-u23BlcstiaynxHrBAVDBXqnAW_E; cheshi_user_authv2MzI2NDUyNAlsaXR0bGVaCXYyCWJjZDYzMWQ4NDZlMTQ4ZWQwY2UzZThhMTFkYTE2YmQxCTE3MzI2NzQ2NjkJNDgyN2JjMTgwZjg5MzIyNDg4MDAyYzg3NjYwOGRmNTY; cheshi_user_infoOTEwNzE0MglsaXR0bGVaCXYyCWJjZDYzMWQ4NDZlMTQ4ZWQwY2UzZThhMTFkYTE2YmQxCTE3MzI2NzQ2NjkJNDgyN2JjMTgwZjg5MzIyNDg4MDAyYzg3NjYwOGRmNTYJCQl3YW5nc2hhbmdjaGVzaGk; cheshi_user_info_for_indexOTEwNzE0MglsaXR0bGVaCXYyCWJjZDYzMWQ4NDZlMTQ4ZWQwY2UzZThhMTFkYTE2YmQxCTE3MzI2NzQ2NjkJNDgyN2JjMTgwZjg5MzIyNDg4MDAyYzg3NjYwOGRmNTYJCQl3YW5nc2hhbmdjaGVzaGk; Hm_lpvt_8fe47348e12ba11be217fd389b1154721732674672; PHPSESSIDbd0f056bb72ef681c01a68b853bde882; pv_source; cheshi_user_prevLogintime1732674716; pv_cheshit1732674722341
cookies = {item.split('=')[0]: item.split('=')[1] for item in cookies.split('; ')}   # convert the cookie string into a dict
print(cookies)
cookies = requests.utils.cookiejar_from_dict(cookies)   # convert the dict into a CookieJar that can be passed to requests
res = requests.get(url, headers=headers, cookies=cookies)

1.4.2. Obtaining cookies, then carrying them
url_login = 'https://api.cheshi.com/services/common/api.php?api=login.Login'
data = {
    'act': 'login',
    'mobile': '18811752638',
    'source': 'pc',
    'password': 'PASSWORD',
    'hold_time': 'yes',
}
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}
res = requests.post(url=url_login, headers=headers, data=data)
# print(res.cookies)
url = 'https://my.cheshi.com/user/'
res = requests.get(url, headers=headers, cookies=res.cookies)

1.5. Maintaining a session with session
A session keeps the conversation going: cookies obtained along the way are stored and carried automatically.
url_login = 'https://api.cheshi.com/services/common/api.php?api=login.Login'
data = {
    'act': 'login',
    'mobile': '18811752638',
    'source': 'pc',
    'password': 'PASSWORD',
    'hold_time': 'yes',
}
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}
session = requests.session()
session.post(url_login, headers=headers, data=data)   # the cookies from the login response are saved and carried automatically
                                                       # (the session does not remember headers, so they still have to be passed below)
url = 'https://my.cheshi.com/user/'
res = session.get(url, headers=headers)
print(res.text)
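As the comment notes, the session does not remember headers by itself; an optional small variation (not in the original notes) is to register them on the session once:

session = requests.session()
session.headers.update(headers)     # every request made with this session now sends these headers
session.post(url_login, data=data)
print(session.get('https://my.cheshi.com/user/').text)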
1.6. Multi-process and multi-threaded crawling

# one process per book category, several threads per process, because the work is IO-bound
import urllib.parse
import requests
from bs4 import BeautifulSoup
import multiprocessing
from concurrent.futures import ProcessPoolExecutor,ThreadPoolExecutor
import urllib
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    # 'referer': 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T'
}

def get_link(url):
    print('current process: {}'.format(multiprocessing.current_process().pid))
    try:
        response = requests.get(url, headers=headers)
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        books = soup.select('.subject-item')
        for book in books:
            title = book.select_one('.info h2 a').text.strip().replace(' ', '').replace('\n', '')
            print(title)
    except Exception as e:
        print(f'Error fetching {url}: {e}')

def th(tag):   # threads within one process
    with ThreadPoolExecutor(max_workers=5) as excutorT:
        urls = []
        for i in range(0, 300, 20):
            tag_q = urllib.parse.quote(tag)
            url = f'https://book.douban.com/tag/{tag_q}?start={i}&type=T'
            urls.append(url)
        futures = [excutorT.submit(get_link, item) for item in urls]
        for future in futures:
            future.result()

if __name__ == '__main__':
    tags = ['小说', '文学']
    with ProcessPoolExecutor(max_workers=2) as executorP:   # one process per tag
        futures = [executorP.submit(th, tag) for tag in tags]
        for future in futures:
            future.result()

1.7. Asynchronous crawling with aiohttp
Async is an event-driven model: asynchronous code is a programming style focused on non-blocking execution, whereas multi-threading/multi-processing are concurrency models. Async can run in a single thread, and it can also be combined with threads or processes for more throughput. async marks a coroutine function and await waits for it to return.
# use aiohttp instead of requests
import aiohttp#pip install aiohttp
import asyncio
from bs4 import BeautifulSoup
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}

async def crawl(i):
    print('crawling page', i)
    url = 'https://xiaohua.zol.com.cn/baoxiao/{}.html'.format(i)
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as resp:
            # print(resp.status)
            text = await resp.text()
            soup = BeautifulSoup(text, 'lxml')
            lists = soup.select('.article-summary .article-title a')
            for item in lists:
                print(item.get_text())

if __name__ == '__main__':
    loop = asyncio.get_event_loop()   # start the event loop
    tasks = [crawl(i) for i in range(1, 10)]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
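On newer Python versions get_event_loop()/asyncio.wait() with bare coroutines is deprecated; an equivalent entry point, a sketch assuming Python 3.7+, is:

async def main():
    await asyncio.gather(*(crawl(i) for i in range(1, 10)))

if __name__ == '__main__':
    asyncio.run(main())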
2. The scrapy framework

Scrapy ships with four spider templates. The default template is used unless you choose otherwise; there is also the crawl template, which can match links on a page and follow them.
2.1. default

pip install scrapy        # install the framework (scrapy has built-in xpath support; call .get() to read an element's content)
scrapy startproject car . # create the project
scrapy genspider app https://product.cheshi.com/rank/2-0-0-0-1/   # create a spider inside the project
scrapy crawl app          # start the spider

2.1.1. Configuring the project
settings.py:
LOG_LEVEL = 'ERROR'
ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
Also uncomment ITEM_PIPELINES and the MIDDLEWARES settings if you use them; a sample snippet follows.
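A rough example of those entries (the class paths follow the default names scrapy generates for a project called car; adjust them to your own project):

ITEM_PIPELINES = {
    'car.pipelines.CarPipeline': 300,
}
DOWNLOADER_MIDDLEWARES = {
    'car.middlewares.CarDownloaderMiddleware': 543,
}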
2.1.2. Defining the data model

items.py:
class BookItem(scrapy.Item):
    title = scrapy.Field()
    publisher = scrapy.Field()
In the spider (app), import the item class (from ..items import CarItem), create an empty item object, assign the scraped data to it, and finally yield item; the item object is then handed to the pipelines.

2.1.3. Crawling data
import scrapy
from ..items import BookItem

class AppSpider(scrapy.Spider):
    name = 'app'
    allowed_domains = ['book.douban.com']
    start_urls = ['https://book.douban.com/latest']

    def parse(self, response):
        books = response.xpath('//ul[@class="chart-dashed-list"]/li')
        for book in books:
            # title = book.xpath('.//h2[@class="clearfix"]/a/text()').get()
            link = book.xpath('.//h2[@class="clearfix"]/a/@href').get()
            # print(title, link)
            yield scrapy.Request(url=link, callback=self.parse_details)
        next_url = response.xpath('//span[@class="next"]/a/@href').get()
        if next_url is not None:
            next_link = response.urljoin(next_url)
            print(next_link)
            yield scrapy.Request(url=next_link, callback=self.parse)

    def parse_details(self, response):
        obj = BookItem()
        obj['title'] = response.xpath('//div[@id="wrapper"]/h1/span/text()').get()
        obj['publisher'] = response.xpath('//div[@id="content"]//div[@id="info"]/a/text()').get()
        # print(title, publisher)
        yield obj

2.1.4. Saving items to the database
pipelines.py (if pipelines are used, ITEM_PIPELINES must be enabled in settings):
class BookPipeline:
    def __init__(self):
        self.client = pymongo.MongoClient('mongodb://localhost:27017')
        self.db = self.client['douban']
        self.col = self.db['book']

    def process_item(self, item, spider):
        self.col.insert_one(dict(item))
        return item

    def __del__(self):
        print('end')

2.1.5. Request and response middleware
middlewares.py (if middlewares are used, SPIDER_MIDDLEWARES or DOWNLOADER_MIDDLEWARES must be enabled in settings). Typical uses are a random user-agent, proxy IPs, driving selenium and adding cookies, mostly inside the DownloaderMiddleware class's process_request and process_response methods; a proxy sketch and the random user-agent example follow.
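A minimal sketch of the proxy-IP case (not in the original notes; the proxy address is a placeholder):

def process_request(self, request, spider):
    # route every request through a proxy
    request.meta['proxy'] = 'http://127.0.0.1:7890'
    return None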
# random user-agent
def process_request(self, request, spider):
    usa = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
    ]
    request.headers['User-Agent'] = random.choice(usa)
    return None

2.1.6. Obtaining and carrying cookies
class AppSpider(scrapy.Spider):name app# allowed_domains [www.cheshi.com]# start_urls [https://my.cheshi.com/user/]def start_requests(self):#1.使用cookie直接访问urlhttps://my.cheshi.com/user/cookiespv_uid1732254874878; cheshi_UUID01JD96ZEWPDK2ZKK8X3WGFFPVS; cheshi_pro_cityMV%2FljJfkuqxfMV%2FkuJzln47ljLpfYmVpamluZw%3D%3D; Hm_lvt_8fe47348e12ba11be217fd389b1154721732254888,1732494411; HMACCOUNT0F938E3E8702278B; Hm_lvt_ed9cf33799965fb6c868762ac84e663e1732674587; PHPSESSIDbd0f056bb72ef681c01a68b853bde882; cheshi_user_prevLogintime1732674716; Hm_lpvt_ed9cf33799965fb6c868762ac84e663e1732675618; cheshi_tokeneyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiIsImp0aSI6ImNoZXNoaV9oNV9zaWduIn0.eyJpc3MiOiJodHRwczpcL1wvYXBpLmNoZXNoaS5jb20iLCJhdWQiOiJodHRwczpcL1wvYXBpLmNoZXNoaS5jb20iLCJqdGkiOiJjaGVzaGlfaDVfc2lnbiIsImlhdCI6MTczMjY3NTY2OSwibmJmIjoxNzMyNjc1NzI5LCJleHAiOjE3MzMyODA0NjksInVpZCI6IjkxMDcxNDIifQ.txhZVRGAHLpMP8whjQ3fPcgNicQmVYake_8s0J1EKzk; cheshi_user_authv2MzI2NDUyNAlsaXR0bGVaCXYyCWJjZDYzMWQ4NDZlMTQ4ZWQwY2UzZThhMTFkYTE2YmQxCTE3MzI2NzU2NjkJMzMyMjhkMGRmNDc3ZTc2YmVlMTQ0Y2JmODg1Zjk5OTY; cheshi_user_infoOTEwNzE0MglsaXR0bGVaCXYyCWJjZDYzMWQ4NDZlMTQ4ZWQwY2UzZThhMTFkYTE2YmQxCTE3MzI2NzU2NjkJMzMyMjhkMGRmNDc3ZTc2YmVlMTQ0Y2JmODg1Zjk5OTYJaHR0cHM6Ly9pbWcuY2hlc2hpLWltZy5jb20vdXNlcnBob3RvL25ldy85MTA3MTQyL2FkNzc3NzZkYmRiM2M5NGFhZTE2YWYyM2M3OGRlMjkzLmpwZwkwCXdhbmdzaGFuZ2NoZXNoaQ; cheshi_user_info_for_indexOTEwNzE0MglsaXR0bGVaCXYyCWJjZDYzMWQ4NDZlMTQ4ZWQwY2UzZThhMTFkYTE2YmQxCTE3MzI2NzU2NjkJMzMyMjhkMGRmNDc3ZTc2YmVlMTQ0Y2JmODg1Zjk5OTYJaHR0cHM6Ly9pbWcuY2hlc2hpLWltZy5jb20vdXNlcnBob3RvL25ldy85MTA3MTQyL2FkNzc3NzZkYmRiM2M5NGFhZTE2YWYyM2M3OGRlMjkzLmpwZwkwCXdhbmdzaGFuZ2NoZXNoaQ; lv1732685873; vn8; Hm_lpvt_8fe47348e12ba11be217fd389b1154721732685874; pv_cheshit1732685902509; pv_sourcecookies{ item.split()[0] : item.split()[1] for item in cookies.split(; )}yield scrapy.Request(urlurl,callbackself.parse,cookiescookies)#2.通过login获取cookieurl_loginhttps://api.cheshi.com/services/common/api.php?apilogin.Logindata{act: login,mobile: 18811752638,source: pc,password: PASSWORD,hold_time: yes,}yield scrapy.FormRequest(urlurl_login,formdatadata,callbackself.parse)def parse(self, response):# print(response.text)#这个是cookies不用设置scrapy后台自动会保存携带这个cookiesurlhttps://my.cheshi.com/user/yield scrapy.Request(urlurl,callbackself.parse_admin)def parse_admin(self,response):print(response.text)2.2.crawl
scrapy genspider -t crawl app https://seller.cheshi.com/beijing/
The rules decide which links get matched and followed:
class AppSpider(CrawlSpider):
    name = 'app'
    allowed_domains = ['seller.cheshi.com']
    start_urls = ['https://seller.cheshi.com/beijing/']
    rules = (
        Rule(LinkExtractor(allow=r'seller.cheshi.com/\d+', deny=r'seller.cheshi.com/\d+/.+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        title = response.xpath('//div[@class="clearfix"]//a[@class="name"]/text()').get()
        print(title, response.url)
2.3. Distributed crawling with Redis

2.3.1. Redis
The Redis basics (string, hash, list, set and zset operations through the redis client) are the same as in section 1.3.2 above.

2.3.2. Crawling data into Redis with scrapy_redis
import scrapy
import json
import re                      # regex matching
from urllib import parse       # URL encoding/decoding
from ..items import JdItem
from scrapy_redis.spiders import RedisSpider   # 1. use redis for distributed crawling; 5. add the scrapy_redis settings in settings.py
# 6. add the key in redis and push the initial url: lpush jingdong https://gw-e.jd.com/client.action?callback=func&body=%7B%22moduleType%22%3A1%2C%22page%22%3A1%2C%22pageSize%22%3A20%2C%22scopeType%22%3A1%7D&functionId=bookRank&client=e.jd.com_1732667213092
# 7. start the distributed spider: scrapy crawl app

class AppSpider(RedisSpider):   # 2. inherit from RedisSpider
    # def __init__(self):
    #     self.page = 1
    def __init__(self, *args, **kwargs):   # 4. init for the distributed spider
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(AppSpider, self).__init__(*args, **kwargs)
        self.page = 1

    name = 'app'
    # allowed_domains = ['channel.jd.com']
    # start_urls = ['https://gw-e.jd.com/client.action?callback=func&body=%7B%22moduleType%22%3A1%2C%22page%22%3A1%2C%22pageSize%22%3A20%2C%22scopeType%22%3A1%7D&functionId=bookRank&client=e.jd.com_1732667213092']
    redis_key = 'jingdong'   # 3. set the redis key

    def parse(self, response):
        match = re.search(r'func\((\{.*\})\)', response.text)
        json_str = match.group(1) if match else None
        if json_str is not None:
            json_data = json.loads(json_str)   # string -> json
            obj = JdItem()
            for book in json_data['data']['books']:
                obj['title'] = book['bookName']
                obj['price'] = book['sellPrice']
                # print(title, price)
                yield obj
            self.page += 1
            next_url = '{{"moduleType":1,"page":{page},"pageSize":20,"scopeType":1}}'.format(page=self.page)   # build the request body string
            next_url = 'https://gw-e.jd.com/client.action?callback=func&body=' + parse.quote(next_url) + '&functionId=bookRank&client=e.jd.com_1732667213092'   # URL-encode the body and splice it into the url
            print(next_url)
            yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)

settings.py:
BOT_NAME = 'jd'
SPIDER_MODULES = ['jd.spiders']
NEWSPIDER_MODULE = 'jd.spiders'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15.7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
SCHEDULER_PERSIST = True
REDIS_URL = 'redis://127.0.0.1:6379'
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
    'jd.pipelines.JdPipeline': 300,               # your own pipeline class
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

2.3.3. Redis to MongoDB
db_redis = redis.Redis(host='localhost', port=6379, decode_responses=True)
client_mongo = pymongo.MongoClient('mongodb://localhost:27017')
db_mongo = client_mongo['Redis2Mongo']
col_mongo = db_mongo['C1']
for i in db_redis.lrange('list2', 0, -1):
    page = {
        # 'title': json.loads(i)['title'],
        'value': i
    }
    res = col_mongo.insert_one(page)
    print(res.inserted_id)

3. Browser automation with selenium
from selenium import webdriver#pip install selenium
import time
driver = webdriver.Chrome(executable_path='./_resources/chromedriver.exe')
driver.get('https://www.cheshi.com/')
print(driver.current_url)              # current url
with open('./cheshi.html', 'w', encoding='utf-8') as f:
    f.write(driver.page_source)        # page source
driver.save_screenshot('cheshi.png')   # screenshot of the page
time.sleep(1)
driver.quit()

3.1. OCR for alphanumeric captchas
# 1. a paid cloud OCR service, or 2. pytesseract
# pip install pytesseract pillow (download Tesseract from https://digi.bib.uni-mannheim.de/tesseract/, add it to PATH, test with tesseract -v)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import time
import cv2 as cv
from PIL import Image
import pytesseract
import re
# 1. first save the captcha image from the page via xpath
driver = webdriver.Chrome(executable_path='./_resources/chromedriver.exe')
driver.get('https://service.cheshi.com/user/login.php')
time.sleep(1)
while driver.current_url == 'https://service.cheshi.com/user/login.php':   # still on the login page means the OCR result was wrong, so try again
    img = driver.find_element(By.XPATH, '//img[@class="yzm_img"]')
    img.screenshot('./captcha.png')
    time.sleep(1)
    # driver.quit()
    # 2. binarise the image, then apply morphology / denoising
    img2 = cv.imread('./captcha.png', flags=cv.IMREAD_GRAYSCALE)
    thresh, binary = cv.threshold(img2, 120, 255, cv.THRESH_BINARY)

    # noise removal
    def interference_point(img):
        h, w = img.shape[:2]
        # walk over every pixel
        for y in range(0, w):
            for x in range(0, h):
                # clear the points on the border
                if y == 0 or y == w - 1 or x == 0 or x == h - 1:
                    img[x, y] = 255
                    continue
                count = 0
                if img[x, y - 1] == 255:
                    count += 1
                if img[x, y + 1] == 255:
                    count += 1
                if img[x - 1, y] == 255:
                    count += 1
                if img[x + 1, y] == 255:
                    count += 1
                if count > 2:
                    img[x, y] = 255
        return img

    # kernel = cv.getStructuringElement(cv.MORPH_RECT, (4, 4))
    # result = cv.morphologyEx(binary, cv.MORPH_OPEN, kernel=kernel)
    # gray = cv.GaussianBlur(result, (5, 5), 0)   # Gaussian blur
    # result = cv.Canny(gray, 75, 250)            # Canny edge detection
    result = interference_point(binary)
    cv.imwrite('./captcha2.png', result)
    # 3. recognise with Tesseract-OCR
    pytesseract.pytesseract.tesseract_cmd = r'D:\Softwares\Tesseract-OCR\tesseract.exe'
    textImage = Image.fromarray(result)
    text = pytesseract.image_to_string(textImage)
    print(text)
    # 4. post-process the recognised text
    exp = re.compile('[a-zA-Z0-9]')
    out = exp.findall(text)
    out = ''.join([str(i) for i in out])
    print('The result:', out)
    # perform the login
    phone = driver.find_element(By.XPATH, '//input[@class="phone"]')
    phone.clear()
    ActionChains(driver).pause(0.5).click(phone).send_keys('18811752638').perform()   # pause 0.5s, click the phone input, then type into it
    yzm = driver.find_element(By.XPATH, '//input[@id="imgyzm"]')
    yzm.clear()
    ActionChains(driver).pause(0.5).click(yzm).send_keys(out).perform()
    fsyzm = driver.find_element(By.XPATH, '//span[@class="sendyzm_btn blue"]')
    fsyzm.click()
    time.sleep(20)
    login = driver.find_element(By.XPATH, '//input[@name="sub"]')
    login.click()
    time.sleep(4)
print(driver.page_source)
time.sleep(4)
driver.quit()
3.2. OpenCV template matching for slider captchas
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import cv2 as cv
from PIL import Image
import numpy
import requests
import re
# 1. first grab the slider-captcha images
driver = webdriver.Chrome(executable_path='./_resources/chromedriver.exe')
driver.get('https://www.liepin.com/')
time.sleep(3)
WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, '/html/body/section[2]/div[2]/div/div/div/div/div[2]/div/div[2]'))
).click()
username = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.XPATH, '//*[@id="login"]'))
)
username.send_keys('18811752638')
password = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.XPATH, '//*[@id="pwd"]'))
)
password.send_keys('xxxx')
checkbox = driver.find_element(By.CLASS_NAME, 'ant-checkbox-input')
checkbox.click()
login = WebDriverWait(driver, 30).until(
    EC.element_to_be_clickable((By.XPATH, '/html/body/section[2]/div[2]/div/div/div/div/div[3]/div/form/button'))
)
login.click()
time.sleep(8)
# switch into the captcha iframe, then fetch the captcha images
driver.switch_to_frame('tcaptcha_iframe')
while driver.current_url == 'https://www.liepin.com/':   # if something goes wrong, refresh and retry
    refresh = driver.find_element(By.XPATH, '//*[@id="reload"]/div')
    refresh.click()
    time.sleep(2)
    back_url = driver.find_element(By.XPATH, '//*[@id="slideBg"]').get_attribute('src')
    back = requests.get(back_url)
    with open('./back.png', 'wb') as f:
        f.write(back.content)
    front_url = driver.find_element(By.XPATH, '//*[@id="slideBlock"]').get_attribute('src')
    front = requests.get(front_url)
    with open('./front.png', 'wb') as f:
        f.write(front.content)
    # 2. compute the slide distance with opencv
    back = cv.imread('./back.png', flags=cv.IMREAD_GRAYSCALE)
    front = cv.imread('./front.png', flags=cv.IMREAD_GRAYSCALE)
    front = front[24:front.shape[0] - 24, 24:front.shape[1] - 24]     # crop the small slider image a bit
    thresh, back = cv.threshold(back, 110, 255, cv.THRESH_BINARY)     # binarise the images
    thresh, front = cv.threshold(front, 40, 255, cv.THRESH_BINARY_INV)
    cv.imwrite('./back_p.png', back)
    cv.imwrite('./front_p.png', front)
    match = cv.matchTemplate(back, front, cv.TM_CCORR_NORMED)
    distance = cv.minMaxLoc(match)[3][0]
    distance = distance * 341 // 680 - 37   # the page renders the image scaled down, so scale the distance the same way; -37 because the slider sits 37px from the left edge
    # print(distance)
    # 3. drag the slider with selenium
    slider = driver.find_element(By.XPATH, '//*[@id="tcaptcha_drag_thumb"]')
    ActionChains(driver).pause(0.2).click_and_hold(slider).pause(0.2).move_by_offset(distance / 4, 5).perform()   # slide in three steps to avoid detection
    ActionChains(driver).pause(0.1).move_by_offset(distance / 2, -2).perform()
    ActionChains(driver).pause(0.1).move_by_offset(distance / 4, 3).release().perform()
    time.sleep(3)
    driver.get('https://www.liepin.com/career/golang/')
driver.quit()

4. JS reverse engineering
A convenient first step: the site https://curlconverter.com/ can write the requests code for you. In the browser dev tools, right-click the request and copy it as cURL (bash), then paste it on that site to generate the Python code.
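The generated code is roughly of this shape (a sketch; the header and cookie values below are placeholders, not output copied from the site):

import requests

cookies = {'PHPSESSID': 'xxxx'}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    'referer': 'https://www.cheshi.com/',
}
response = requests.get('https://www.cheshi.com/', cookies=cookies, headers=headers)
print(response.status_code)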
4.1. Obfuscated/encrypted responses

There are many encryption schemes. Response bodies usually use symmetric ciphers such as AES and DES, or a custom scheme; request headers and payloads usually use RSA (asymmetric), MD5, SHA-256, Base64 and so on. To decrypt, search for keywords such as encrypt, decrypt and JSON.parse, or for the request path and the names of the returned parameters, and set breakpoints; then either write JS that reproduces the decryption and call it from Python, or jump straight into the source code from the call stack (initiator). If the initiator shows Promise.then, the call is an asynchronous AJAX request, and searching for JSON.parse is usually a good starting point.
4.1.1. Decrypting in JS

Decrypting in JS requires installing the Node libraries below; calling the JS from Python also requires a Python package.
npm install crypto-js
npm install jsdom
npm install base64-js
import execjs   # pip install pyexecjs2
ctx = execjs.compile(open('./3zhaobiao.js', 'r', encoding='utf-8').read()).call('decryptByDES', response.json())   # call the decryptByDES function defined in that JS file, passing response.json() as its argument

4.1.1.1. AES
AES has two common modes here, ECB and CBC. In both modes the plaintext is padded up to a multiple of the block size (16 bytes for AES), so the ciphertext length is a multiple of 16 bytes; without extra authentication data the ciphertext has the same length as the padded plaintext.

ECB
ECB (Electronic Codebook) is the simpler AES mode: each block is encrypted independently and no initialization vector (IV) is used. During decryption unpad strips the padding; since the AES block size is 16 bytes, unpad(info, 16) removes the extra padding bytes.
const CryptoJS = require('crypto-js')
function a(e, t) {
    var n = CryptoJS.enc.Utf8.parse(t)   // || '46cc793c53dc451b'
    return CryptoJS.AES.decrypt(e, n, {
        mode: CryptoJS.mode.ECB,
        padding: CryptoJS.pad.Pkcs7
    }).toString(CryptoJS.enc.Utf8)
}
data1 = 'GsGEjP8dAjIYaDNgCxOkJYJQrOECQf8iB5dv6yI'
data = JSON.parse(a(data1, 'efabccee-b754-4c'))
console.log(data)

CBC
In CBC (Cipher Block Chaining) mode an initialization vector (IV) is used and each block's encryption depends on the previous one, so the same plaintext encrypted under different IVs produces different ciphertexts. unpad again strips the padding so the decrypted data comes out as expected.
const CryptoJS = require('crypto-js')
function b(t) {
    var e = CryptoJS.enc.Utf8.parse('EB444973714E4A40876CE66BE45D5930'),
        n = CryptoJS.enc.Utf8.parse('B5A8904209931867'),
        a = CryptoJS.AES.decrypt(t, e, {
            iv: n,
            mode: CryptoJS.mode.CBC,
            padding: CryptoJS.pad.Pkcs7
        });
    return a.toString(CryptoJS.enc.Utf8)
}
t = 'MZphJmFlelDpw2aSCfdFb5yXTSQuTw'
console.log(b(t))

4.1.1.2. DES
With DES the ciphertext length is normally a multiple of 8 bytes, because the block size is 64 bits.

Parameter                      Meaning
iv                             initialization vector (hex), has to be obtained from the site
mode                           cipher mode, usually hard-coded
padding                        padding scheme, usually hard-coded
return ciphertext.toString()   the ciphertext returned as a hex string
const CryptoJS = require('crypto-js')
function b(t) {
    var e = CryptoJS.enc.Utf8.parse('EB444973714E4A40876CE66BE45D5930'),
        n = CryptoJS.enc.Utf8.parse('B5A8904209931867'),
        a = CryptoJS.AES.decrypt(t, e, {
            iv: n,
            mode: CryptoJS.mode.CBC,
            padding: CryptoJS.pad.Pkcs7
        });
    return a.toString(CryptoJS.enc.Utf8)
}

4.1.1.3. RSA
RSA is an asymmetric algorithm: the public key encrypts and the private key decrypts, and encrypting the same data twice gives different results. The ciphertext length equals the key length.
// the account and password sent to the login endpoint are both encrypted; find the login button's click handler with a global search or via the click event, step into it and look for the encryption (the code comments mention it)
window = global;
const JSEncrypt = require('jsencrypt')   // npm install jsencrypt
global.window = {};
function et(_0x32033c) {
    var _0x283d00 = 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAsgDq4OqxuEisnk2F0EJFmw4xKa5IrcqEYHvqxPs2CHEg2kolhfWA2SjNuGAHxyDDE5MLtOvzuXjBx/5YJtc9zj2xR/0moesSVi/xtG1tkVaTCbaTVY5C61iyr3FGqrKOD4/XECu0Xky1W9ZmmaFADmZi76gO9wjgVpU9aLcBcw/loHOeJrCqjp7pA98hRJRYMML8MK15mnC4ebooOvamJlstW6t/1lghR8WNV8cocxgcHHuXBxgns2MlACQbSdJ8c6Z3RQeRZBzyjfey6JCCfbEKouVrWIUuPphBL3OANfgp0BQG31bapvePTfXU48TYK0M5kE8LgbbWQIDAQAB';
    var _0x1defd6 = new JSEncrypt();
    _0x1defd6['setPublicKey'](_0x283d00);
    var _0x4bd6d3 = _0x1defd6['encrypt'](_0x32033c);
    return _0x4bd6d3;
}
console.log(et('18811752638'))

4.1.1.4. MD5
MD5 digests are normally 32 hex characters (digits and letters, no symbols); the same input always gives the same digest, and it cannot be reversed, only brute-forced. Payload data sent as JSON corresponds to the json argument of a request; form data sent as a dict corresponds to the data argument and is used when submitting forms.
// generate the portal-sign request header
const CryptoJS = require('crypto')
e = {
    ts: (new Date).getTime(),
    type: 12,
    IS_IMPORT: 1,
    pageSize: 3
}
function l(t, e) {
    return t.toString().toUpperCase() > e.toString().toUpperCase() ? 1 : t.toString().toUpperCase() == e.toString().toUpperCase() ? 0 : -1
}
function u(t) {
    for (var e = Object.keys(t).sort(l), n = '', a = 0; a < e.length; a++)
        if (void 0 !== t[e[a]])
            if (t[e[a]] && t[e[a]] instanceof Object || t[e[a]] instanceof Array) {
                var i = JSON.stringify(t[e[a]]);
                n += e[a] + i
            } else
                n += e[a] + t[e[a]];
    return n
}
function s(text) {
    return CryptoJS.createHash('md5').update(text).digest('hex');
}
function d(t) {
    for (var e in t) '' !== t[e] && void 0 !== t[e] || delete t[e];
    var n = 'B3978D054A72A7002063637CCDF6B2E5' + u(t);
    return s(n).toLocaleLowerCase()
}
console.log(d(e))
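A rough Python port of the sign generation above (a sketch written under the assumption that the salt and the key/value handling match the JS exactly; verify against the real requests before relying on it):

import hashlib
import json

def portal_sign(params: dict) -> str:
    salt = 'B3978D054A72A7002063637CCDF6B2E5'
    parts = []
    for key in sorted(params, key=lambda k: str(k).upper()):   # same ordering as l()
        value = params[key]
        if value is None or value == '':                        # d() drops empty values
            continue
        if isinstance(value, (dict, list)):                     # u() JSON-stringifies objects/arrays
            value = json.dumps(value, separators=(',', ':'))
        parts.append(f'{key}{value}')
    return hashlib.md5((salt + ''.join(parts)).encode()).hexdigest().lower()

print(portal_sign({'ts': 1732674722341, 'type': 12, 'IS_IMPORT': 1, 'pageSize': 3}))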
4.1.1.5. SHA-256

SHA-256 digests are 64 characters long (digits and letters, no symbols).
const CryptoJS = require('crypto-js');
const hash = CryptoJS.SHA256('hello world').toString(CryptoJS.enc.Hex);
console.log(hash);

4.1.1.6. Base64
The Base64 string (72 characters in this case) is made of digits and letters and usually ends with '='.
4.1.2. Decrypting in Python
4.1.2.1. AES

ECB
import base64
from Crypto.Cipher import AES            # pip install pycryptodome
from Crypto.Util.Padding import unpad

data1 = response.json()['data1']
def AES_decrypt(data1):
    html = base64.b64decode(data1)
    key = b'40w42rjLEXxYhxRn'
    aes = AES.new(key, AES.MODE_ECB)
    info = aes.decrypt(html)
    decrypt_data = unpad(info, 16).decode()
    return decrypt_data
data_out = AES_decrypt(data1)
print(data_out)

CBC
def decrypt_aes_cbc(ciphertext_base64, key, iv):
    ciphertext = base64.b64decode(ciphertext_base64)                      # decode the base64 ciphertext into bytes
    cipher = AES.new(key, AES.MODE_CBC, iv)                               # create the AES decryptor in CBC mode with the given IV
    decrypted_data = unpad(cipher.decrypt(ciphertext), AES.block_size)    # decrypt and strip the padding
    return decrypted_data.decode('utf-8')                                 # bytes -> str (assuming UTF-8)

key = 'EB444973714E4A40876CE66BE45D5930'   # replace with the real key
iv = 'B5A8904209931867'                    # replace with the real IV (16 bytes)
decrypted_message = decrypt_aes_cbc(response.json()['Data'], key.encode('utf-8'), iv.encode('utf-8'))   # decrypt
print('Decrypted message:', decrypted_message)

4.1.2.2. DES
import requests
from Crypto.Cipher import DES
from Crypto.Util.Padding import unpad
import base64

def decrypt_by_des(ciphertext_base64, key):
    # decode the Base64 ciphertext
    ciphertext = base64.b64decode(ciphertext_base64)
    # create the DES cipher in ECB mode (PKCS7 padding)
    cipher = DES.new(key.encode('utf-8'), DES.MODE_ECB)
    # decrypt and strip the padding
    decrypted_bytes = unpad(cipher.decrypt(ciphertext), DES.block_size)
    # bytes back to str
    return decrypted_bytes.decode('utf-8')
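A self-contained check of the helper above (the key and plaintext are made up; a real DES key must be exactly 8 bytes):

from Crypto.Cipher import DES
from Crypto.Util.Padding import pad
import base64

key = '12345678'
demo = base64.b64encode(
    DES.new(key.encode('utf-8'), DES.MODE_ECB).encrypt(pad('hello'.encode('utf-8'), DES.block_size))
).decode()
print(decrypt_by_des(demo, key))   # -> hello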
4.1.2.3. Base64

import math
import time
import base64

time1 = math.floor(time.time() * 1000)
mcode = base64.b64encode(str(time1).encode()).decode()
print(mcode)
print(mcode)4.2.hook注入反debug var AAA Function.prototype.constructor;
Function.prototype.constructor function(x) {if (x! debugger) {return AAA(x);}return function() {};
};