# 导入需要的库
import requests
from lxml import etree
import pymysql
import time
import ucms
import os
# Endpoint that scraped articles are submitted to (passed to ucms.post in getWebData).
posturl='https://xxxx.net/et_post/'
# Article detail record class
class articleData():
    """Plain record holding the fields of one scraped article.

    Attributes:
        title: Article title.
        url: Article path/URL (labelled as the path in ``to_string``).
        content: Third article field; presumably the article body -- the
            original comments and labels disagree about it, confirm with callers.
    """

    def __init__(self, title, url, content):
        """Store the three article fields unchanged."""
        self.title = title  # article title
        self.url = url  # article path
        self.content = content  # article content (see class docstring)

    def to_string(self):
        """Print a one-line summary of the article and return it.

        The summary is built with f-string formatting, so fields that are
        not ``str`` (for example the lists returned by ``xpath``) no longer
        raise ``TypeError``. The text is also returned so that callers can
        embed it in a larger message; it is still printed, as before.

        Returns:
            The summary line that was printed.
        """
        # NOTE(review): the last label says "缩略图" (thumbnail) although the
        # value is `content`; the label is kept so the output stays unchanged.
        summary = f"文章名称:{self.title};文章路径:{self.url};缩略图:{self.content}"
        print(summary)
        return summary
def text_create(name, msg):
    """Write ``msg`` to ``./<name>.txt``, replacing any existing content.

    Args:
        name: File name without the ``.txt`` extension.
        msg: Text to store in the file.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    desktop_path = "./"  # directory the text file is created in
    full_path = desktop_path + name + '.txt'
    # The context manager closes the handle even when write() raises.
    with open(full_path, 'w') as file:
        file.write(msg)
def text_read(name):
    """Return the first line of ``./<name>.txt`` and echo it to stdout.

    Args:
        name: File name without the ``.txt`` extension.

    Returns:
        The first line of the file, including its trailing newline if the
        file has one; an empty string when the file is empty.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    desktop_path = "./"  # directory the text file is read from
    full_path = desktop_path + name + '.txt'
    # Previously close() was placed after the return statement and never ran;
    # the context manager guarantees the handle is released.
    with open(full_path, 'r') as file:
        msg = file.readline()
    print(msg)
    return msg
# Save scraped article data to the database
def saveData(title,url,content):
    """Insert one article row into the ``info`` table of the ``test`` database.

    The values are passed to the driver as query parameters rather than
    being interpolated into the SQL text, so quotes or other special
    characters in the scraped text can neither break the statement nor
    inject SQL.

    Args:
        title: Value for the ``title`` column.
        url: Value for the ``url`` column.
        content: Value for the ``content`` column.

    Raises:
        pymysql.MySQLError: If connecting, executing or committing fails.
    """
    # NOTE(review): connection settings and credentials are hard-coded here.
    count = pymysql.connect(
        host='127.0.0.1',  # database host
        port=3306,  # database port
        user='root',  # database account
        password='usbw',  # database password
        db='test'  # database name
    )
    try:
        # Cursor used to run the statement.
        db = count.cursor()
        try:
            sql = "insert into info(title,url,content) values (%s,%s,%s)"
            db.execute(sql, (title, url, content))
            # Persist the inserted row.
            count.commit()
        finally:
            db.close()
    finally:
        # The connection was previously never closed and leaked on every call.
        count.close()
# Crawl the target site and post the scraped articles
def getWebData():
    """Crawl listing pages from the saved checkpoint and post every article.

    The starting page number is read from ``mytxtfile.txt`` via
    ``text_read``. For each listing page up to (but not including) 60000 the
    article links are extracted, each article is fetched, and its title and
    first paragraph text are submitted with ``ucms.post`` to ``posturl``.
    After a listing page has been processed its number is written back to
    ``mytxtfile.txt`` via ``text_create``.

    A failure on a single article is reported and skipped. A failure while
    fetching or parsing a listing page propagates to the caller.

    NOTE(review): the source file had lost its indentation. The 0.7 s pause
    is assumed to follow every article and the checkpoint write to follow
    every listing page -- confirm against the original script.

    Raises:
        ValueError: If the checkpoint file does not start with an integer.
        requests.RequestException: If a listing page cannot be fetched.
    """
    # Request headers imitating a desktop browser; identical for every
    # request, so they are built once instead of once per page.
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }
    # Without a timeout a stalled connection would block the crawler forever.
    request_timeout = 30

    page = text_read('mytxtfile')
    for i in range(int(page), 60000):
        # URL of the listing page.
        list_url = "http://www.2itcn.com/news/p_%s.html" % i
        html = requests.get(list_url, headers=header, timeout=request_timeout)
        html.close()
        # Force UTF-8 decoding to avoid garbled text.
        html.encoding = 'UTF-8'
        etreeHtml = etree.HTML(html.text)
        # Relative links of the articles listed on this page.
        links = etreeHtml.xpath('//*[@id="article"]/div[3]/div[1]/div/ul//h3/a/@href')
        for j, link in enumerate(links):
            try:
                article = requests.get('http://www.2itcn.com/' + link,
                                       headers=header, timeout=request_timeout)
                article.encoding = 'UTF-8'
                article_tree = etree.HTML(article.text)
                # NOTE: xpath returns lists of text nodes; they are posted
                # as-is, exactly as before (use ''.join(...) for a string).
                title = article_tree.xpath('//*[@id="article"]/div[3]/div[1]/div[1]/h1/text()')  # article title
                content = article_tree.xpath('//*[@id="article"]/div[3]/div[1]/div[1]/p[1]/text()')  # first paragraph text
                data = {
                    'cid': 2,
                    'title': title,
                    'content': content,
                }
                ucms.post(posturl, data)
                print(j, title)
            except Exception as exc:
                # Best effort: report and skip a broken article instead of
                # silently swallowing everything (including Ctrl-C).
                print("failed to process article", j, link, exc)
            # Throttle requests to the remote site.
            time.sleep(0.7)
        # Checkpoint the page just processed so a restart resumes from here.
        text_create('mytxtfile', str(i))
if __name__ == "__main__":
    # Run the crawler, retrying after a pause whenever it fails, until one
    # run finishes normally.
    while 1:
        try:
            getWebData()
        except Exception as exc:
            # `except Exception` (not a bare except) lets Ctrl-C / SystemExit
            # stop the script; the error is reported instead of being hidden.
            print("crawl failed, retrying in 60 seconds:", exc)
            time.sleep(60)
        else:
            # The crawl reached its last page; restarting would only re-post
            # the final page over and over.
            break
# Added: error-handling capability
# Contact the site owner via QQ: 3216572