A Python crawler that scrapes news articles and inserts them into your own remote MySQL database! It took me quite a while to get this code right, so I'm sharing it with everyone. Give it a like if you find it useful.

# -*- coding: utf-8 -*-
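# Overview of what the script below does, step by step:
#   1. Parse a local sitemap.xml into a pool of URLs (meant for backlink
#      injection via createwailian(), which is left disabled further down).
#   2. Open paginated news-list pages on www.106ms.com in Selenium-driven Chrome.
#   3. Collect every article link on each list page via XPath.
#   4. Visit each article and extract its title, meta description, and body HTML.
#   5. Insert each article as one row into a remote MySQL `news` table via pymysql.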
from xml.etree import ElementTree as ET
import datetime
import random
import pymysql
from selenium import webdriver
from lxml import etree
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def strreplace_v1(old_str, key, value):
    # Replace one or more occurrences of a substring in a string
    new_str = old_str.replace(key, value)
    return new_str


def get_page_source_html(driver, urlinfo):
    # Load a URL and return the rendered page as an lxml tree
    driver.get(urlinfo)
    page_text = driver.page_source
    tree = etree.HTML(page_text)
    return tree


def get_page_source_etree(driver):
    # Return the currently loaded page as an lxml tree
    page_text = driver.page_source
    tree = etree.HTML(page_text)
    return tree


def get_list_a(tree, xpathinfo):
    return tree.xpath(xpathinfo)


def get_news_title(tree, xpathinfo):
    return tree.xpath(xpathinfo)


def get_news_content(tree, xpathinfo):
    return tree.xpath(xpathinfo)


def get_news_publish(tree, xpathinfo):
    return tree.xpath(xpathinfo)


def getUA():
    # Pool of user-agent strings; unused ones are left commented out
    uaList = [
        # 360
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        # chrome
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
        # 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        # firefox
        # 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0',
        # ie11
        # 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        # ie8
        # 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; 4399Box.1357; 4399Box.1253; 4399Box.1357)',
        # 2345Explorer
        # 'Chrome/39.0.2171.99 Safari/537.36 2345Explorer/6.5.0.11018',
        # sogou
        # 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0',
        # opera
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    ]
    headers = random.choice(uaList)
    return headers


def get_desurl_list():
    # Collect the full set of target URLs (about 500) from a local sitemap.xml
    urlinfo_list = []
    tree = ET.parse('sitemap.xml')
    for rank in tree.iter('loc'):
        urlinfo_list.append(rank.text)
    return urlinfo_list


def createwailian(urlwllist, urlzhiru):
    # Swap the 'hrefinfo' placeholder for a randomly chosen backlink URL
    # (random.choice avoids the off-by-one in random.randint(0, len(list)))
    return strreplace_v1(urlzhiru, 'hrefinfo', random.choice(urlwllist))
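# A note on the helpers above (illustrative example, not from the original
# script): they return lxml elements, so attribute values are read with an
# XPath step like './@href' rather than Selenium's get_attribute(). For example:
#     tree = etree.HTML('<dl><dd><h3><a href="/p/1">t</a></h3></dd></dl>')
#     tree.xpath('//a/@href')   # -> ['/p/1']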
if __name__ == '__main__':
    allwlurllist = get_desurl_list()
    options = Options()
    options.add_argument('--disable-desktop-notifications')
    # Randomize the user-agent for each run
    options.add_argument('--user-agent=%s' % getUA())
    # options.add_argument('--proxy-server={0}'.format('103.37.141.69:80'))
    # Create the browser object
    driver = webdriver.Chrome(options=options)
    urlend = ''
    urlbegin = 'http://www.106ms.com/index.php?list=6-'
    for urlstart in range(1, 10):
        print('当前正访问:{0}'.format(urlbegin + str(urlstart)))
        driver.get(urlbegin + str(urlstart))
        # //*[@id="menu-item-10"]/a
        # Click the "SEO basics" link
        # driver.find_element(By.XPATH, value='//*[@id="menu-item-5"]/a').click()
        # Collected hrefs of the <a> tags on this list page
        list_a = []
        # xpath() returns a list of element objects; to read an attribute you
        # still need another XPath expression against the element itself.
        # //*[@id="moar"]/section[2]/div/div[1]/div[1]/dl/dd/h3/a
        a_list = get_list_a(get_page_source_etree(driver),
                            '//*[@id="moar"]/section[2]/div/div/div/dl/dd/h3/a')
        sleep(1)
        for a in a_list:
            href = a.xpath('./@href')[0]
            list_a.append(href)
        print('当前页面获取a标签集合长度为{0}'.format(len(list_a)))
        sleep(1)
        # Walk the current list_a
        try:
            db = pymysql.Connect(
                host='8.142.*.*',          # server IP address (masked in the post)
                port=3306,                 # default MySQL port
                user='106iiaa',            # user name
                password='yrdsrootadmi3',  # password
                charset='utf8',            # character set
                db='hbdsa89aa'             # database name
            )
            cursor = db.cursor()
            # Optional HTML fragments (disclaimer and backlink entrances), left disabled:
            # xuanyan = ('<p>本平台所发布的部分公开信息来源于互联网,转载的目的在于传递更多信息及用于网络分享,'
            #            '并不代表本站赞同其观点,本平台所提供的信息只供参考之用。不保证信息的准确性、有效性、'
            #            '及时性和完整性。如有侵权请联系:[14878741214]删除,谢谢合作</p>')
            # urlzhiru = ('<p>网站入口1:<a href="hrefinfo" target="_blank">网站入口地址</a></p>'
            #             '<p>网站入口2:<a href="http://diyigefan.com/" target="_blank">diyigefan.com</a></p>')
            readnumber = 0
            base_url = 'http://www.106ms.com'
            for newsurl in list_a:
                try:
                    wanquan = base_url + newsurl
                    # print(wanquan)
                    driver.get(wanquan)
                    # title = driver.find_element(By.CLASS_NAME, value='article-title').text
                    # content = driver.find_element(By.CLASS_NAME, value='article-content').text
                    # newscontent = driver.find_element(By.XPATH, value='//div[@class="news_txt"]')
                    # newscontentfordes = driver.find_element(By.CLASS_NAME, value='news_txt').text
                    # The sixth <meta> in <head> carries the page description
                    newscontentfordes = driver.find_element(By.XPATH, value='/html/head/meta[6]')
                    # Wrap the page source with BeautifulSoup, then extract the body content
                    soup = BeautifulSoup(driver.page_source, features='lxml', from_encoding='utf-8')
                    for s in soup('img'):
                        s.extract()
                    for s in soup('a'):
                        s.extract()
                    # <img> and <a> tags have been stripped out at this point
                    allp = soup.find('article', {'class': 'content text-left'}).findAll('div')
                    paragraphs = []
                    for x in allp:
                        paragraphs.append(str(x))
                    # Drop the last element (trailing boilerplate)
                    content2 = ''.join(paragraphs[0:-1])
                    # print(content2)
                    # Title
                    title = driver.find_element(
                        By.XPATH,
                        value='//*[@id="moar"]/section[2]/div/div/div/article/header/h2').text
                    keywords = title
                    # content = content2 + createwailian(allwlurllist, urlzhiru)
                    content = content2
                    # content = newscontent + xuanyan
                    # A WebElement has no .strip(); the description text lives in
                    # the meta tag's content attribute
                    des = str(newscontentfordes.get_attribute('content')).strip()[0:120]
                    sql = ('insert into news (title, keywords, des, content, author, publish, click, '
                           'state, attr, attrdiy, flag, cate, uid) '
                           'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
                    values = (title, keywords, des, content, 'admin',
                              str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                              random.randint(100, 999), 0, 0, 0, '画室新闻', 5, 3)
                    cursor.execute(sql, values)
                    db.commit()
                    readnumber = readnumber + 1
                    print('标题:{0}---插入数据库成功'.format(title))
                except Exception as ee:
                    print('发生了异常', ee)
                    continue
        except Exception as e:
            # print('发生了异常', e)
            db.rollback()
        finally:
            cursor.close()
            db.close()
        sleep(5)
    print('本次任务成功植入{0}篇软文.'.format(readnumber))
    sleep(600)
    driver.quit()
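For context, the INSERT statement assumes a `news` table with those thirteen columns already exists on the remote server. The post never shows the actual schema, so the following DDL is only a hypothetical sketch inferred from the column names and the values the script binds; the author's real table will differ in types, defaults, and extra columns.

# Hypothetical schema inferred from the INSERT above -- an assumption, not the
# author's real table definition.
NEWS_TABLE_DDL = """
CREATE TABLE news (
    id       INT AUTO_INCREMENT PRIMARY KEY,  -- assumed surrogate key
    title    VARCHAR(255),
    keywords VARCHAR(255),   -- the script reuses the title here
    des      VARCHAR(255),   -- first 120 chars of the meta description
    content  TEXT,           -- cleaned article HTML
    author   VARCHAR(64),    -- always 'admin' in this script
    publish  DATETIME,       -- bound as a '%Y-%m-%d %H:%M:%S' string
    click    INT,            -- random value between 100 and 999
    state    TINYINT,
    attr     TINYINT,
    attrdiy  TINYINT,
    flag     VARCHAR(64),
    cate     INT,
    uid      INT
)
"""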
One prerequisite: you need to install the packages this script depends on ahead of time; otherwise it will throw errors.
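Concretely, judging by the imports, that means the PyPI packages selenium, PyMySQL, lxml, and beautifulsoup4, plus a ChromeDriver binary matching your local Chrome (since the script drives `webdriver.Chrome`). A minimal sanity check you can run first:

# Quick sanity check: an ImportError here means one of the scraper's
# dependencies (selenium, PyMySQL, lxml, beautifulsoup4) is not installed.
import bs4
import lxml
import pymysql
import selenium

print('All scraper dependencies are importable.')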