# Scraper for www.2itcn.com news listings: walks list pages, downloads each
# article's detail page, and pushes title/content to a UCMS endpoint via the
# ucms helper module.  The last processed list-page index is checkpointed in
# ./mytxtfile.txt so the crawl resumes where it left off after a restart.
import requests
from lxml import etree
import pymysql
import time
import ucms
import os

# UCMS article-push endpoint (ClassCms et_post plugin).
posturl = 'https://xxxx.net/et_post/'


class articleData():
    """Value object for one article: title, URL and content/thumbnail."""

    def __init__(self, title, url, content):
        self.title = title      # article title
        self.url = url          # article URL
        self.content = content  # article content / thumbnail path

    def to_string(self):
        # Debug helper: print all fields on one line.
        print("文章名称:" + self.title + ";文章路径:" + self.url + ";缩略图:" + self.content)


def text_create(name, msg):
    """Write msg to ./<name>.txt, replacing any previous content."""
    full_path = "./" + name + '.txt'
    # 'with' guarantees the handle is closed even if write() raises
    # (the original used open()/close() with no try/finally).
    with open(full_path, 'w') as f:
        f.write(msg)


def text_read(name):
    """Return the first line of ./<name>.txt (the saved checkpoint)."""
    full_path = "./" + name + '.txt'
    # The original placed file.close() after 'return' (unreachable);
    # the with-block closes the file correctly.
    with open(full_path, 'r') as f:
        msg = f.readline()
    print(msg)
    return msg


def saveData(title, url, content):
    """Insert one article row into the local MySQL table test.info."""
    conn = pymysql.connect(
        host='127.0.0.1',  # database host
        port=3306,         # database port
        user='root',       # database user
        password='usbw',   # database password
        db='test'          # database name
    )
    try:
        cursor = conn.cursor()
        # Parameterized query: the original interpolated values into the SQL
        # with an f-string, which broke on quotes and allowed SQL injection.
        sql = "insert into info(title,url,content) values (%s,%s,%s)"
        cursor.execute(sql, (title, url, content))
        conn.commit()
        cursor.close()
    finally:
        conn.close()  # the original leaked the connection


def getWebData():
    """Crawl list pages from the checkpoint onward, pushing every article."""
    page = text_read('mytxtfile')  # resume from the last saved page index
    # Browser-like header (hoisted out of the loop: it is loop-invariant).
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }
    for i in range(int(page), 60000):
        # List page i of the news section.
        url = "http://www.2itcn.com/news/p_%s.html" % i
        html = requests.get(url, headers=header)
        html.close()
        html.encoding = 'UTF-8'  # force encoding to avoid mojibake
        etreeHtml = etree.HTML(html.text)
        # Links to the article detail pages on this list page.
        url = etreeHtml.xpath('//*[@id="article"]/div[3]/div[1]/div/ul//h3/a/@href')
        for j in range(len(url)):
            try:
                html = requests.get('http://www.2itcn.com/' + url[j], headers=header)
                html.encoding = 'UTF-8'
                etreeHtml = etree.HTML(html.text)
                title = etreeHtml.xpath('//*[@id="article"]/div[3]/div[1]/div[1]/h1/text()')  # article title
                # NOTE(review): the original also extracted '.../img/@scr'
                # (a misspelling of @src) into an unused variable; dropped.
                content = etreeHtml.xpath('//*[@id="article"]/div[3]/div[1]/div[1]/p[1]/text()')  # first paragraph
                data = {}
                data['cid'] = 2
                data['title'] = title
                data['content'] = content
                r = ucms.post(posturl, data)
                print(j, title)
            except Exception as e:
                # Best-effort crawl: log and skip failing articles instead of
                # silently swallowing every error (original: bare except/pass).
                print('skip', url[j], e)
            time.sleep(0.7)  # throttle requests
        text_create('mytxtfile', str(i))  # checkpoint progress


if __name__ == "__main__":
    # Run forever; on any crash wait a minute and restart from the checkpoint.
    while 1:
        try:
            getWebData()
        except Exception as e:
            print(e)
            time.sleep(60)
添加了:处理错误的能力(出错后等待 60 秒,从检查点继续采集)
欢迎联系本站长QQ:3216572
最新回复 (6)
-
admin 2022-10-222楼
# Minimal client for the ClassCms "et_post" push endpoint: POSTs article
# dicts and maps the endpoint's plain-text replies to numeric status codes.
import requests
import re


def post(url, data):
    """POST one article dict to the et_post endpoint.

    Returns {'code': int, 'msg': str}: code 1 is success, codes 2-6 map the
    endpoint's known error strings, 7 means the HTTP request itself failed.
    """
    callback = {}
    try:
        r = requests.post(url, data)
        result = r.text
        if result == '[ok]':
            callback['code'] = 1
            callback['msg'] = '推送成功'
        elif result == 'no cid':
            callback['code'] = 2
            callback['msg'] = '栏目cid参数不能为空'
        elif result == 'channel not exist':
            callback['code'] = 3
            callback['msg'] = '栏目不存在'
        elif 'field list' in result:
            # MySQL "Unknown column 'x' in 'field list'": pull the offending
            # field name out of the quoted part.  Guard against a failed
            # match — the original called txt.group(1) unconditionally and
            # would have raised AttributeError when no quotes were present.
            txt = re.search(r'\'(.*?)\'', result)
            callback['code'] = 4
            callback['msg'] = '栏目字段 ' + (str(txt.group(1)) if txt else result) + ' 不存在'
        elif '404 Not Found' in result:
            callback['code'] = 5
            callback['msg'] = '采集接口网址错误'
        else:
            callback['code'] = 6
            callback['msg'] = '网络错误或其它未知错误'
        return callback
    except requests.RequestException:
        # Narrowed from a bare except: only a network/HTTP failure means
        # "the URL could not be reached".
        callback['code'] = 7
        callback['msg'] = '网址无法访问'
        return callback


def post_list(url, dataarry):
    """POST every dict in dataarry; return {'code': 1, 'successnum': n}."""
    callback = {}
    a = 0
    for data in dataarry:
        r = post(url, data)
        if r['code'] == 1:
            a = a + 1
    callback['code'] = 1
    callback['successnum'] = a
    return callback
上面脚本中 import 的 ucms 模块的代码
-
admin 2022-10-223楼
<?php
// ClassCms plugin "editortools": exposes an /et_post/ HTTP endpoint that
// accepts POSTed article fields and inserts them as a CMS article.
if(!defined('ClassCms')) {exit();}
class editortools {
    // Register the plugin route: requests to /et_post/ are handled by post().
    function route(){
        $routes=array();
        $routes[]=array('hash'=>'post','uri'=>'/et_post/','function'=>'post','enabled'=>1);
        Return $routes;
    }
    // Endpoint handler: validates the POSTed article and writes it via the
    // cms:article:add service.  Replies with a short plain-text status
    // string that the Python ucms client parses ('[ok]', 'no cid', ...).
    function post() {
        if(!count($_POST)) { echo('error'); Return ; }
        $article=$_POST;
        // Drop any POST key that is not a valid field hash.
        foreach ($article as $key => $value) {
            if(!is_hash($key)){ unset($article[$key]); }
        }
        // Channel id may come from the query string as a fallback.
        if(!isset($article['cid']) && isset($_GET['cid'])) { $article['cid']=$_GET['cid']; }
        if(!isset($article['cid'])) { echo('no cid'); Return ; }
        // NOTE(review): this replies 'no channel', but the Python client
        // checks for 'channel not exist' — the two should agree; confirm.
        if(!$channel=C('cms:channel:get',$article['cid'])) { echo('no channel'); Return ; }
        if(config('login')) {
            // Login required: refuse anonymous posts.
            $userid=C('admin:nowUser');
            if(!$userid) { echo('no login'); Return ; }
            if(config('powercheck')) {
                // Check the logged-in user may add articles to this channel.
                $channel=C('admin:article:channelGet',$article['cid']);
                if(!C('admin:check',C('cms:module:authStr',$channel['_module'],'add'))) { echo('no power'); Return ; }
            }
        }
        if(config('repostcheck') && isset($article['title'])) {
            // Duplicate-title check within the same channel.
            $article_query=array('cid'=>$article['cid'],'where'=>array('title'=>$article['title']));
            if(C('cms:article:getOne',$article_query)) { echo('repost'); Return ; }
        }
        $id=C('cms:article:add',$article);
        // A numeric id means the insert succeeded.
        if(is_numeric($id)) { echo('[ok]'); }else { echo('insert error'); }
        Return true;
    }
    // Plugin configuration switches shown in the ClassCms admin UI.
    function config() {
        $configs=array();
        $configs[]=array('configname'=>'登入','hash'=>'login','inputhash'=>'switch','tips'=>'关闭后,不需要登入即可通过接口发布文章,请注意安全问题!!!默认使用模拟发布,请在发布前获取网站cookie','tabname'=>'','defaultvalue'=>1);
        $configs[]=array('configname'=>'权限判断','hash'=>'powercheck','inputhash'=>'switch','tips'=>'登入后,判断相应登入用户是否有对应栏目的发布权限','tabname'=>'','defaultvalue'=>1);
        $configs[]=array('configname'=>'判断重复','hash'=>'repostcheck','inputhash'=>'switch','tips'=>'判断标题字段(title)是否重复,如果重复则发布失败','tabname'=>'','defaultvalue'=>0);
        Return $configs;
    }
}
ClassCms 端的 et_post 接口插件文件(接收上面脚本推送的文章)
-
admin 2022-10-224楼
# Scraper for m.mingyihui.net sitemap pages: extracts article titles and
# links from each sitemap page.  The UCMS push and DB save calls are left
# commented out, so the script currently only prints what it finds.
import requests
from lxml import etree
import pymysql
import time
import ucms

# UCMS article-push endpoint (ClassCms et_post plugin).
posturl = 'https://xxxx.net/et_post/'


class articleData():
    """Value object for one article: title, URL and content/thumbnail."""

    def __init__(self, title, url, content):
        self.title = title      # article title
        self.url = url          # article URL
        self.content = content  # article content / thumbnail path

    def to_string(self):
        # Debug helper: print all fields on one line.
        print("文章名称:" + self.title + ";文章路径:" + self.url + ";缩略图:" + self.content)


def saveData(title, url, content):
    """Insert one article row into the local MySQL table test.info."""
    conn = pymysql.connect(
        host='127.0.0.1',  # database host
        port=3306,         # database port
        user='root',       # database user
        password='usbw',   # database password
        db='test'          # database name
    )
    try:
        cursor = conn.cursor()
        # Parameterized query instead of f-string interpolation (the original
        # broke on quotes in the data and allowed SQL injection).
        sql = "insert into info(title,url,content) values (%s,%s,%s)"
        cursor.execute(sql, (title, url, content))
        conn.commit()
        cursor.close()
    finally:
        conn.close()  # the original leaked the connection


def getWebData():
    """Walk sitemap pages 2-9 and print (and optionally push) every link."""
    # Browser-like header (hoisted out of the loop: it is loop-invariant).
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }
    for i in range(2, 10):
        url = "http://m.mingyihui.net/s/sitemap_new_%s.html" % i
        html = requests.get(url, headers=header)
        html.encoding = 'UTF-8'  # force encoding to avoid mojibake
        etreeHtml = etree.HTML(html.text)
        title = etreeHtml.xpath('/html/body/div[1]/div/div[1]/ul//a/text()')  # article titles
        url = etreeHtml.xpath('/html/body/div[1]/div/div[1]/ul//a/@href')     # article links
        # NOTE(review): the original also ran the relative xpath
        # './article/a/div[3]/div/div[2]/text()' against the document root,
        # which always yields [] — dropped as dead code.
        for j in range(len(url)):
            try:
                # saveData(title[j], url[j], j)
                data = {}
                data['cid'] = 5
                data['title'] = title[j]
                data['content'] = title[j]
                # r = ucms.post(posturl, data)
                print(j, title[j])
            except Exception as e:
                # title may be shorter than url (IndexError); log and keep
                # going instead of silently swallowing (original: bare except).
                print('skip', j, e)
            time.sleep(0.7)  # throttle requests


if __name__ == "__main__":
    getWebData()
-
admin 2022-10-225楼
# Scraper for xhzy movie resource listings: walks category pages, fetches
# each movie's detail page, and pushes title/poster/synopsis plus an
# embedded m3u8 player iframe to a UCMS endpoint.
import requests
from lxml import etree
import pymysql
import time
import ucms
from lxml import html  # NOTE(review): unused and shadowed by the local
                       # 'html' response variable below; kept as-is.

# UCMS article-push endpoint (ClassCms et_post plugin).
posturl = 'https://xxxx.net/et_post/'


class articleData():
    """Value object for one article: title, URL and content/thumbnail."""

    def __init__(self, title, url, content):
        self.title = title      # article title
        self.url = url          # article URL
        self.content = content  # article content / thumbnail path

    def to_string(self):
        # Debug helper: print all fields on one line.
        print("文章名称:" + self.title + ";文章路径:" + self.url + ";缩略图:" + self.content)


def saveData(title, url, content):
    """Insert one article row into the local MySQL table test.info."""
    conn = pymysql.connect(
        host='127.0.0.1',  # database host
        port=3306,         # database port
        user='root',       # database user
        password='usbw',   # database password
        db='test'          # database name
    )
    try:
        cursor = conn.cursor()
        # Parameterized query instead of f-string interpolation (the original
        # broke on quotes in the data and allowed SQL injection).
        sql = "insert into info(title,url,content) values (%s,%s,%s)"
        cursor.execute(sql, (title, url, content))
        conn.commit()
        cursor.close()
    finally:
        conn.close()  # the original leaked the connection


def text_create(name, msg):
    """Write msg to ./<name>.txt, replacing any previous content."""
    full_path = "./" + name + '.txt'
    # 'with' guarantees the handle is closed even if write() raises.
    with open(full_path, 'w') as f:
        f.write(msg)


def getWebData():
    """Walk category pages 2-378 and push every movie found."""
    # Browser-like header (hoisted out of the loop: it is loop-invariant).
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }
    for i in range(2, 379):
        url = "http://xhzy06.com/vodtype/20-%s.html" % i
        html = requests.get(url, headers=header)
        html.encoding = 'UTF-8'  # force encoding to avoid mojibake
        etreeHtml = etree.HTML(html.text)
        # Links to the movie detail pages on this category page.
        url = etreeHtml.xpath('/html/body/div[2]/div[2]/ul//a[1]/@href')
        for j in range(len(url)):
            try:
                # NOTE(review): list pages come from xhzy06.com but detail
                # pages are fetched from xhzy01.com — confirm this mirror
                # mapping is intentional.
                html = requests.get('https://xhzy01.com/' + url[j], headers=header)
                html.encoding = 'UTF-8'
                etreeHtml = etree.HTML(html.text)
                title = etreeHtml.xpath('/html/body/div[3]/h1/text()')          # movie title
                img = etreeHtml.xpath('/html/body/div[3]/div/div[1]/img/@src')  # poster image
                info = etreeHtml.xpath('//*[@id="content"]/div/p/text()')       # synopsis paragraph
                mp4 = etreeHtml.xpath('//*[@id="kyxhm3u80"]/text()')            # "label$streamurl" entry
                x = mp4[0].split("$", 1)  # x[1] is the m3u8 stream URL
                content = etreeHtml.xpath('/html/body/div[3]/div/div[2]/p[4]/text()')
                data = {}
                data['cid'] = 8
                data['title'] = title[0]
                data['pic'] = img[0]
                # Article body: intro + synopsis + embedded player + poster.
                data['content'] = content[0]+info[0]+"<br><iframe src='//mv.lmxx.net/player/m3u8.php?url="+x[1]+"' scrolling='0' frameborder='0' width='100%' height='380' allowfullscreen='allowfullscreen' mozallowfullscreen='mozallowfullscreen' msallowfullscreen='msallowfullscreen' oallowfullscreen='oallowfullscreen' webkitallowfullscreen='webkitallowfullscreen'></iframe><br>电影封面<br><img src="+img[0]+" >"
                r = ucms.post(posturl, data)
                print(j, title[0], x[1])
                # text_create('mytxt', str(content))
            except Exception as e:
                # Pages missing an expected node raise IndexError; log and
                # skip instead of silently swallowing (original: bare except).
                print('skip', url[j], e)
            time.sleep(7)  # heavy throttle between detail requests


if __name__ == "__main__":
    getWebData()
采集电影资源的脚本
-
admin 2022-10-236楼
# Scraper for www.2itcn.com news (mobile detail pages): walks list pages,
# fetches each article from m.2itcn.com, and pushes title/image/content to a
# UCMS endpoint.  The next list-page index is checkpointed in ./mytxtfile.txt
# so the crawl resumes after a restart.
import requests
from lxml import etree
import pymysql
import time
import ucms
import os

# UCMS article-push endpoint (ClassCms et_post plugin).
posturl = 'https://xxxx.net/et_post/'


class articleData():
    """Value object for one article: title, URL and content/thumbnail."""

    def __init__(self, title, url, content):
        self.title = title      # article title
        self.url = url          # article URL
        self.content = content  # article content / thumbnail path

    def to_string(self):
        # Debug helper: print all fields on one line.
        print("文章名称:" + self.title + ";文章路径:" + self.url + ";缩略图:" + self.content)


def text_create(name, msg):
    """Write msg to ./<name>.txt, replacing any previous content."""
    full_path = "./" + name + '.txt'
    # 'with' guarantees the handle is closed even if write() raises.
    with open(full_path, 'w') as f:
        f.write(msg)


def text_read(name):
    """Return the first line of ./<name>.txt (the saved checkpoint)."""
    full_path = "./" + name + '.txt'
    # The original placed file.close() after 'return' (unreachable);
    # the with-block closes the file correctly.
    with open(full_path, 'r') as f:
        msg = f.readline()
    print(msg)
    return msg


def saveData(title, url, content):
    """Insert one article row into the local MySQL table test.info."""
    conn = pymysql.connect(
        host='127.0.0.1',  # database host
        port=3306,         # database port
        user='root',       # database user
        password='usbw',   # database password
        db='test'          # database name
    )
    try:
        cursor = conn.cursor()
        # Parameterized query instead of f-string interpolation (the original
        # broke on quotes in the data and allowed SQL injection).
        sql = "insert into info(title,url,content) values (%s,%s,%s)"
        cursor.execute(sql, (title, url, content))
        conn.commit()
        cursor.close()
    finally:
        conn.close()  # the original leaked the connection


def getWebData():
    """Crawl list pages from the checkpoint onward, pushing every article."""
    page = text_read('mytxtfile')  # resume from the last saved page index
    # Browser-like header (hoisted out of the loop: it is loop-invariant).
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }
    for i in range(int(page), 60000):
        print("jingruxunhuan")  # debug marker: entering the page loop
        url = "http://www.2itcn.com/news/p_%s.html" % i
        html = requests.get(url, headers=header)
        html.close()
        html.encoding = 'UTF-8'  # force encoding to avoid mojibake
        etreeHtml = etree.HTML(html.text)
        # Links to the article detail pages on this list page.
        url = etreeHtml.xpath('//*[@id="article"]/div[3]/div[1]/div/ul//h3/a/@href')
        for j in range(len(url)):
            try:
                html = requests.get('http://m.2itcn.com/' + url[j], headers=header)
                html.encoding = 'UTF-8'
                etreeHtml = etree.HTML(html.text)
                title = etreeHtml.xpath('//*[@id="article"]//h1/text()')    # article title
                img = etreeHtml.xpath('//*[@id="soft-info"]/img/@src')      # cover image path
                content = etreeHtml.xpath('//*[@id="detailse"]/p/text()')   # body paragraphs
                # Join the paragraph list into one string (the original used
                # a generator whose variable shadowed the page index 'i').
                content1 = ''.join(content)
                data = {}
                data['cid'] = 102
                # Images are served from the OSS mirror, so prefix the path.
                data['img'] = 'https://img-xc.oss-cn-beijing.aliyuncs.com' + img[0]
                data['title'] = title
                data['content'] = content1
                r = ucms.post(posturl, data)
                print(j, r, title)
            except Exception as e:
                # Keep the original "chucuo" ("error") marker but also log
                # the exception instead of hiding it (original: bare except).
                print("chucuo", e)
        text_create('mytxtfile', str(i + 1))  # checkpoint the NEXT page


if __name__ == "__main__":
    # Run forever; on any crash wait a minute and restart from the checkpoint.
    while 1:
        try:
            getWebData()
        except Exception as e:
            print(e)
            time.sleep(60)
-
admin 2022-10-257楼
# Scraper for m.mingyihui.net doctor profiles: iterates doctor ids, extracts
# the profile fields via xpath, and pushes them to a UCMS endpoint.  The next
# doctor id is checkpointed in ./myh.txt so the crawl resumes after a restart.
import requests
from lxml import etree
import pymysql
import time
import ucms
import os

# UCMS article-push endpoint (ClassCms et_post plugin).
posturl = 'https://xxxx.net/et_post/'


def text_create(name, msg):
    """Write msg to ./<name>.txt, replacing any previous content."""
    full_path = "./" + name + '.txt'
    # 'with' guarantees the handle is closed even if write() raises.
    with open(full_path, 'w') as f:
        f.write(msg)


def text_read(name):
    """Return the first line of ./<name>.txt (the saved checkpoint)."""
    full_path = "./" + name + '.txt'
    # The original placed file.close() after 'return' (unreachable);
    # the with-block closes the file correctly.
    with open(full_path, 'r') as f:
        msg = f.readline()
    print(msg)
    return msg


def getWebData():
    """Crawl doctor pages from the checkpoint onward, pushing each profile."""
    page = text_read('myh')  # resume from the last saved doctor id
    # Browser-like header (hoisted out of the loop: it is loop-invariant).
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }
    for i in range(int(page), 60000):
        url = "http://m.mingyihui.net/doctor_%s/index.html" % i
        html = requests.get(url, headers=header)
        html.close()
        html.encoding = 'UTF-8'  # force encoding to avoid mojibake
        etreeHtml = etree.HTML(html.text)
        img = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[1]/img/@src')                  # portrait
        name = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[1]/span[1]/text()')    # doctor name
        jibie = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[2]/span[1]/text()')   # professional rank
        keshi = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[2]/span[2]/a/text()') # department
        yiyuanurl = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[3]/a/@href')      # hospital link
        yiyuanname = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[3]/a/text()')    # hospital name
        shanchang = etreeHtml.xpath('/html/body//div[2]/div[1]/div[5]/div/text()')                 # specialties
        jingli = etreeHtml.xpath('/html/body//div[2]/div[1]/div[6]/div[1]/text()')                 # biography
        print(img, name, jibie, keshi, yiyuanurl, yiyuanname)
        text_create('myh', str(i + 1))  # checkpoint before pushing
        # Skip ids whose page has no doctor name.  The original tested
        # 'not name[0] is None', which raised IndexError on missing pages
        # and aborted the whole crawl via the outer retry loop.
        if name:
            data = {}
            data['cid'] = 101
            data['title'] = name
            data['content'] = jingli
            data['pic'] = img
            data['type'] = jibie
            data['room'] = keshi
            data['hospital'] = yiyuanname
            data['export'] = shanchang
            data['ago'] = yiyuanurl
            r = ucms.post(posturl, data)
            # BUG FIX: the original printed the undefined variable 'title',
            # raising NameError after every successful push.
            print(i, r, name)


if __name__ == "__main__":
    # Run forever; on any crash restart immediately from the checkpoint.
    while 1:
        try:
            getWebData()
        except Exception as e:
            # Log the error instead of swallowing it (original: bare except).
            print(e)
            time.sleep(0)