小说采集并发布到ucms的python代码

admin 2022-10-22 277

# 导入需要的库
import requests
from lxml import etree
import pymysql
import time
import ucms
import os
# 文章详情信息类
# UCMS publish endpoint: the et_post plugin route articles are POSTed to.
posturl='https://xxxx.net/et_post/'
class articleData():
    """Plain holder for one scraped article: title, url and content."""

    def __init__(self, title, url, content):
        # Stored verbatim as passed in; the original inline labels
        # (url="abstract", content="path") disagreed with the names.
        self.title = title
        self.url = url
        self.content = content

    def to_string(self):
        """Print a one-line human-readable summary (returns None)."""
        summary = "文章名称:" + self.title + ";文章路径:" + self.url + ";缩略图:" + self.content
        print(summary)
        
def text_create(name, msg):
    """Write *msg* to ./<name>.txt, overwriting any existing file.

    Acts as a tiny checkpoint store so the crawler can resume from the
    last processed page number after a restart.
    """
    full_path = "./" + name + '.txt'
    # 'with' guarantees the handle is closed even if write() raises;
    # explicit UTF-8 avoids locale-dependent failures on non-ASCII text.
    with open(full_path, 'w', encoding='utf-8') as file:
        file.write(msg)
    
def text_read(name):
    """Return the first line of ./<name>.txt (the saved page checkpoint).

    The original had an unreachable file.close() after the return,
    leaking the handle; the context manager fixes that.
    """
    full_path = "./" + name + '.txt'
    with open(full_path, 'r', encoding='utf-8') as file:
        msg = file.readline()
    print(msg)  # keep the original progress echo
    return msg
  
  
#保存狗狗详情数据
#保存数据
def saveData(title, url, content):
    """Insert one article row into test.info.

    The original interpolated the values straight into the SQL string
    (injection-prone, and broken by any quote character in a title);
    this version uses pymysql parameter binding and always releases the
    connection.
    """
    conn = pymysql.connect(
        host='127.0.0.1',  # database host
        port=3306,         # database port
        user='root',       # database user
        password='usbw',   # database password
        db='test'          # schema name
    )
    try:
        with conn.cursor() as cursor:
            # %s placeholders are escaped by the driver, not by us.
            sql = "insert into info(title,url,content) values (%s,%s,%s)"
            cursor.execute(sql, (title, url, content))
        conn.commit()
    finally:
        conn.close()
# 爬取数据的方向
def getWebData():
    """Crawl 2itcn.com news list pages and push each article to UCMS.

    Resumes from the page number saved in mytxtfile.txt and checkpoints
    progress after every page.
    """
    page = text_read('mytxtfile')
    # Loop-invariant: pretend to be a desktop Chrome browser.
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }
    for i in range(int(page), 60000):
        # List page i (separate name so it no longer shadows the hrefs).
        list_url = "http://www.2itcn.com/news/p_%s.html" % i
        html = requests.get(list_url, headers=header)
        html.encoding = 'UTF-8'
        etreeHtml = etree.HTML(html.text)
        html.close()
        # Relative hrefs of every article on the list page.
        links = etreeHtml.xpath('//*[@id="article"]/div[3]/div[1]/div/ul//h3/a/@href')
        for j, link in enumerate(links):
            try:
                detail = requests.get('http://www.2itcn.com/' + link, headers=header)
                detail.encoding = 'UTF-8'
                detailHtml = etree.HTML(detail.text)
                # xpath() returns lists; join them so UCMS receives plain
                # text instead of the repr of a Python list.
                title = ''.join(detailHtml.xpath('//*[@id="article"]/div[3]/div[1]/div[1]/h1/text()'))
                content = ''.join(detailHtml.xpath('//*[@id="article"]/div[3]/div[1]/div[1]/p[1]/text()'))
                data = {'cid': 2, 'title': title, 'content': content}
                ucms.post(posturl, data)
                print(j, title)
            except Exception as e:
                # Skip one broken article but say why; the original bare
                # except also swallowed KeyboardInterrupt silently.
                print('skip', j, e)
        time.sleep(0.7)
        # Checkpoint the NEXT page so a restart never re-posts page i.
        text_create('mytxtfile', str(i + 1))
if __name__ == "__main__":
    # Run forever; on any error wait a minute and resume from the saved
    # checkpoint. except Exception (not a bare except) keeps Ctrl-C able
    # to stop the crawler.
    while 1:
        try:
            getWebData()
        except Exception as e:
            print('restarting after error:', e)
            time.sleep(60)

添加了:处理错误的能力

欢迎联系本站长QQ:3216572
最新回复 (6)
  • admin 2022-10-22
    2
    import requests
    import re
    def post(url, data):
        """POST one article dict to the UCMS et_post endpoint.

        Returns {'code': int, 'msg': str}; code 1 means the push
        succeeded, 7 means the endpoint could not be reached at all.
        """
        callback = {}
        try:
            r = requests.post(url, data)
            result = r.text
            if result == '[ok]':
                callback['code'] = 1
                callback['msg'] = '推送成功'
            elif result == 'no cid':
                callback['code'] = 2
                callback['msg'] = '栏目cid参数不能为空'
            elif result == 'channel not exist':
                # NOTE(review): the PHP plugin shown elsewhere echoes
                # 'no channel', not 'channel not exist' — confirm which
                # string the deployed endpoint actually returns.
                callback['code'] = 3
                callback['msg'] = '栏目不存在'
            elif 'field list' in result:
                # MySQL "unknown column ... in field list" — extract the
                # offending field name from the quoted error text.
                txt = re.search(r'\'(.*?)\'', result)
                callback['code'] = 4
                callback['msg'] = '栏目字段 ' + str(txt.group(1)) + ' 不存在'
            elif '404 Not Found' in result:
                callback['code'] = 5
                callback['msg'] = '采集接口网址错误'
            else:
                callback['code'] = 6
                callback['msg'] = '网络错误或其它未知错误'
        except Exception:
            # Network failure / unreachable host (narrowed from a bare
            # except, which also caught KeyboardInterrupt).
            callback['code'] = 7
            callback['msg'] = '网址无法访问'
        return callback
    def post_list(url, dataarry):
        """Push every dict in *dataarry* via post(); count the successes.

        Returns {'code': 1, 'successnum': <number of code-1 pushes>}.
        """
        successes = 0
        for item in dataarry:
            if post(url, item)['code'] == 1:
                successes += 1
        callback = {'code': 1, 'successnum': successes}
        return callback

    ucms 的代码

  • admin 2022-10-22
    3
    <?php
    if(!defined('ClassCms')) {exit();}
    // et_post plugin for ClassCMS: exposes a /et_post/ route that accepts
    // POSTed article fields and inserts them as a CMS article. All status
    // reporting is a plain-text echo that the Python client string-matches.
    class editortools {
        // Register the plugin route: requests to /et_post/ dispatch to post().
        function route(){
            $routes=array();
            $routes[]=array('hash'=>'post','uri'=>'/et_post/','function'=>'post','enabled'=>1);
            Return $routes;
        }
        // Handle one publish request; echoes '[ok]' on success or a short
        // error token ('no cid', 'no channel', 'repost', ...) otherwise.
        function post() {
            // Reject empty requests outright.
            if(!count($_POST)) {
                echo('error');
                Return ;
            }
            $article=$_POST;
            // Drop any POST key that is not a valid field hash.
            foreach ($article as $key => $value) {
                if(!is_hash($key)){
                    unset($article[$key]);
                }
            }
            // The channel id may arrive in the query string instead.
            if(!isset($article['cid']) && isset($_GET['cid'])) {
                $article['cid']=$_GET['cid'];
            }
            if(!isset($article['cid'])) {
                echo('no cid');
                Return ;
            }
            // NOTE(review): the Python client checks for the literal string
            // 'channel not exist', but this echoes 'no channel' — one of the
            // two sides should be aligned or code 3 is never reported.
            if(!$channel=C('cms:channel:get',$article['cid'])) {
                echo('no channel');
                Return ;
            }
            // Optional auth: require a logged-in admin user...
            if(config('login')) {
                $userid=C('admin:nowUser');
                if(!$userid) {
                    echo('no login');
                    Return ;
                }
                // ...and optionally that the user may add to this channel.
                if(config('powercheck')) {
                    $channel=C('admin:article:channelGet',$article['cid']);
                    if(!C('admin:check',C('cms:module:authStr',$channel['_module'],'add'))) {
                        echo('no power');
                        Return ;
                    }
                }
            }
            // Optional duplicate check on the title field.
            if(config('repostcheck') && isset($article['title'])) {
                $article_query=array('cid'=>$article['cid'],'where'=>array('title'=>$article['title']));
                if(C('cms:article:getOne',$article_query)) {
                    echo('repost');
                    Return ;
                }
            }
            // Insert; a numeric return value is the new article id.
            $id=C('cms:article:add',$article);
            if(is_numeric($id)) {
                echo('[ok]');
            }else {
                echo('insert error');
            }
            Return true;
        }
        // Declare the plugin's admin-panel switches: login required,
        // per-channel permission check, duplicate-title check.
        function config() {
            $configs=array();
            $configs[]=array('configname'=>'登入','hash'=>'login','inputhash'=>'switch','tips'=>'关闭后,不需要登入即可通过接口发布文章,请注意安全问题!!!默认使用模拟发布,请在发布前获取网站cookie','tabname'=>'','defaultvalue'=>1);
            $configs[]=array('configname'=>'权限判断','hash'=>'powercheck','inputhash'=>'switch','tips'=>'登入后,判断相应登入用户是否有对应栏目的发布权限','tabname'=>'','defaultvalue'=>1);
            $configs[]=array('configname'=>'判断重复','hash'=>'repostcheck','inputhash'=>'switch','tips'=>'判断标题字段(title)是否重复,如果重复则发布失败','tabname'=>'','defaultvalue'=>0);
            Return $configs;
        }
    }

    classcms  post 文件

  • admin 2022-10-22
    4
    # 导入需要的库
    import requests
    from lxml import etree
    import pymysql
    import time
    import ucms
    # 文章详情信息类
    posturl='https://xxxx.net/et_post/'
    class articleData():
        """Plain holder for one scraped article: title, url and content."""

        def __init__(self, title, url, content):
            # Stored verbatim; the original inline labels disagreed with
            # the attribute names.
            self.title = title
            self.url = url
            self.content = content

        def to_string(self):
            """Print a one-line human-readable summary (returns None)."""
            summary = "文章名称:" + self.title + ";文章路径:" + self.url + ";缩略图:" + self.content
            print(summary)
    #保存狗狗详情数据
    #保存数据
    def saveData(title, url, content):
        """Insert one article row into test.info.

        Replaces the f-string SQL (injection-prone, broken by quotes in
        titles) with pymysql parameter binding, and always releases the
        connection.
        """
        conn = pymysql.connect(
            host='127.0.0.1',  # database host
            port=3306,         # database port
            user='root',       # database user
            password='usbw',   # database password
            db='test'          # schema name
        )
        try:
            with conn.cursor() as cursor:
                # %s placeholders are escaped by the driver, not by us.
                sql = "insert into info(title,url,content) values (%s,%s,%s)"
                cursor.execute(sql, (title, url, content))
            conn.commit()
        finally:
            conn.close()
    # 爬取数据的方向
    def getWebData():
        """Walk mingyihui.net sitemap pages 2-9 and print article titles.

        The UCMS push was commented out in the original, so this version
        only builds and prints what it would send.
        """
        # Loop-invariant request header.
        header = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
        }
        for i in range(2, 10):
            # Separate name so the page URL no longer shadows the hrefs.
            page_url = "http://m.mingyihui.net/s/sitemap_new_%s.html" % i
            html = requests.get(page_url, headers=header)
            html.encoding = 'UTF-8'
            etreeHtml = etree.HTML(html.text)
            titles = etreeHtml.xpath('/html/body/div[1]/div/div[1]/ul//a/text()')
            links = etreeHtml.xpath('/html/body/div[1]/div/div[1]/ul//a/@href')
            for j in range(len(links)):
                try:
                    # Payload kept for when the ucms.post call is re-enabled.
                    data = {'cid': 5, 'title': titles[j], 'content': titles[j]}
                    print(j, titles[j])
                except IndexError:
                    # titles and links can differ in length; skip orphans
                    # (narrowed from a bare except that hid every error).
                    pass
            time.sleep(0.7)
    # Script entry point: run one full crawl pass over the sitemap pages.
    if __name__ == "__main__":
        getWebData()


  • admin 2022-10-22
    5
    # 导入需要的库
    import requests
    from lxml import etree
    import pymysql
    import time
    import ucms
    from lxml import html
    posturl='https://xxxx.net/et_post/'
    class articleData():
        """Plain holder for one scraped article: title, url and content."""

        def __init__(self, title, url, content):
            # Stored verbatim as passed in.
            self.title = title
            self.url = url
            self.content = content

        def to_string(self):
            """Print a one-line human-readable summary (returns None)."""
            summary = "文章名称:" + self.title + ";文章路径:" + self.url + ";缩略图:" + self.content
            print(summary)
    def saveData(title, url, content):
        """Insert one article row into test.info.

        Uses pymysql parameter binding instead of the original f-string
        SQL (injection-prone, broken by quotes), and always releases the
        connection.
        """
        conn = pymysql.connect(
            host='127.0.0.1',  # database host
            port=3306,         # database port
            user='root',       # database user
            password='usbw',   # database password
            db='test'          # schema name
        )
        try:
            with conn.cursor() as cursor:
                # %s placeholders are escaped by the driver, not by us.
                sql = "insert into info(title,url,content) values (%s,%s,%s)"
                cursor.execute(sql, (title, url, content))
            conn.commit()
        finally:
            conn.close()
    def text_create(name, msg):
        """Write *msg* to ./<name>.txt (checkpoint/debug helper)."""
        full_path = "./" + name + '.txt'
        # Context manager closes the file even if write() raises;
        # explicit UTF-8 avoids locale-dependent failures.
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(msg)
    # 爬取数据的方向
    def getWebData():
        """Crawl xhzy video list pages and publish each movie to UCMS.

        Each detail page yields a title, a poster image, an intro and a
        "label$m3u8-url" play source; the content field embeds an iframe
        player around that m3u8 URL.
        """
        # Loop-invariant request header.
        header = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
        }
        for i in range(2, 379):
            list_url = "http://xhzy06.com/vodtype/20-%s.html" % i
            listing = requests.get(list_url, headers=header)
            listing.encoding = 'UTF-8'
            listHtml = etree.HTML(listing.text)
            # First <a> of each list entry -> relative detail-page href.
            links = listHtml.xpath('/html/body/div[2]/div[2]/ul//a[1]/@href')
            for j in range(len(links)):
                try:
                    # NOTE(review): list pages come from xhzy06.com but the
                    # details are fetched from xhzy01.com — confirm these
                    # mirror domains really serve the same paths.
                    detail = requests.get('https://xhzy01.com/' + links[j], headers=header)
                    detail.encoding = 'UTF-8'
                    page = etree.HTML(detail.text)
                    title = page.xpath('/html/body/div[3]/h1/text()')
                    img = page.xpath('/html/body/div[3]/div/div[1]/img/@src')
                    info = page.xpath('//*[@id="content"]/div/p/text()')
                    mp4 = page.xpath('//*[@id="kyxhm3u80"]/text()')
                    # Play source is "label$m3u8-url"; x[1] is the URL part.
                    x = mp4[0].split("$", 1)
                    content = page.xpath('/html/body/div[3]/div/div[2]/p[4]/text()')
                    data = {}
                    data['cid'] = 8
                    data['title'] = title[0]
                    data['pic'] = img[0]
                    data['content'] = content[0]+info[0]+"<br><iframe src='//mv.lmxx.net/player/m3u8.php?url="+x[1]+"' scrolling='0' frameborder='0' width='100%' height='380' allowfullscreen='allowfullscreen' mozallowfullscreen='mozallowfullscreen' msallowfullscreen='msallowfullscreen' oallowfullscreen='oallowfullscreen' webkitallowfullscreen='webkitallowfullscreen'></iframe><br>电影封面<br><img src="+img[0]+" >"
                    r = ucms.post(posturl, data)
                    print(j, title[0], x[1])
                except Exception as e:
                    # Narrowed from a bare except that also ate Ctrl-C;
                    # report why an item was skipped instead of hiding it.
                    print('skip', j, e)
            time.sleep(7)
    # Script entry point: run one full crawl pass over the movie listings.
    if __name__ == "__main__":
        getWebData()

    采集电影的

  • admin 2022-10-23
    6
    # 导入需要的库
    import requests
    from lxml import etree
    import pymysql
    import time
    import ucms
    import os
    # 文章详情信息类
    posturl='https://xxxx.net/et_post/'
    class articleData():
        """Plain holder for one scraped article: title, url and content."""

        def __init__(self, title, url, content):
            # Stored verbatim as passed in.
            self.title = title
            self.url = url
            self.content = content

        def to_string(self):
            """Print a one-line human-readable summary (returns None)."""
            summary = "文章名称:" + self.title + ";文章路径:" + self.url + ";缩略图:" + self.content
            print(summary)
            
    def text_create(name, msg):
        """Write *msg* to ./<name>.txt, overwriting any existing file.

        Used as a page-number checkpoint so the crawler can resume.
        """
        full_path = "./" + name + '.txt'
        # 'with' guarantees the handle is closed even if write() raises.
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(msg)
        
    def text_read(name):
        """Return the first line of ./<name>.txt (the saved checkpoint).

        The original had an unreachable file.close() after the return,
        leaking the handle; the context manager fixes that.
        """
        full_path = "./" + name + '.txt'
        with open(full_path, 'r', encoding='utf-8') as file:
            msg = file.readline()
        print(msg)  # keep the original progress echo
        return msg
      
      
    #保存狗狗详情数据
    #保存数据
    def saveData(title, url, content):
        """Insert one article row into test.info.

        Uses pymysql parameter binding instead of the original f-string
        SQL (injection-prone, broken by quotes), and always releases the
        connection.
        """
        conn = pymysql.connect(
            host='127.0.0.1',  # database host
            port=3306,         # database port
            user='root',       # database user
            password='usbw',   # database password
            db='test'          # schema name
        )
        try:
            with conn.cursor() as cursor:
                # %s placeholders are escaped by the driver, not by us.
                sql = "insert into info(title,url,content) values (%s,%s,%s)"
                cursor.execute(sql, (title, url, content))
            conn.commit()
        finally:
            conn.close()
    # 爬取数据的方向
    def getWebData():
        """Crawl 2itcn.com news pages and push each article to UCMS cid 102.

        Resumes from the page number stored in mytxtfile.txt; the
        checkpoint is advanced to i+1 after each page so restarts never
        repeat a page.
        """
        page = text_read('mytxtfile')
        # Loop-invariant request header.
        header = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
        }
        for i in range(int(page), 60000):
            print("jingruxunhuan")
            list_url = "http://www.2itcn.com/news/p_%s.html" % i
            html = requests.get(list_url, headers=header)
            html.encoding = 'UTF-8'
            etreeHtml = etree.HTML(html.text)
            html.close()
            links = etreeHtml.xpath('//*[@id="article"]/div[3]/div[1]/div/ul//h3/a/@href')
            for j in range(len(links)):
                try:
                    detail = requests.get('http://m.2itcn.com/' + links[j], headers=header)
                    detail.encoding = 'UTF-8'
                    detailHtml = etree.HTML(detail.text)
                    # Join xpath result lists so UCMS receives plain text
                    # instead of the repr of a Python list (the original
                    # posted the raw title list).
                    title = ''.join(detailHtml.xpath('//*[@id="article"]//h1/text()'))
                    img = detailHtml.xpath('//*[@id="soft-info"]/img/@src')
                    content1 = ''.join(detailHtml.xpath('//*[@id="detailse"]/p/text()'))
                    data = {}
                    data['cid'] = 102
                    # Images are mirrored on the Aliyun OSS bucket.
                    data['img'] = 'https://img-xc.oss-cn-beijing.aliyuncs.com' + img[0]
                    data['title'] = title
                    data['content'] = content1
                    r = ucms.post(posturl, data)
                    print(j, r, title)
                except Exception as e:
                    # Report why an article was skipped instead of the
                    # original silent bare except.
                    print("chucuo", e)
            text_create('mytxtfile', str(i + 1))
            
    if __name__ == "__main__":
        # Retry loop: wait a minute after a crash, then resume from the
        # checkpoint. except Exception (not a bare except) keeps Ctrl-C
        # able to stop the crawler.
        while 1:
            try:
                getWebData()
            except Exception as e:
                print('restarting after error:', e)
                time.sleep(60)


  • admin 2022-10-25
    7
    import requests
    from lxml import etree
    import pymysql
    import time
    import ucms
    import os
    # 文章详情信息类
    posturl='https://xxxx.net/et_post/'
    def text_create(name, msg):
        """Write *msg* to ./<name>.txt, overwriting any existing file.

        Used as the doctor-id checkpoint so the crawler can resume.
        """
        full_path = "./" + name + '.txt'
        # 'with' guarantees the handle is closed even if write() raises.
        with open(full_path, 'w', encoding='utf-8') as file:
            file.write(msg)
        
    def text_read(name):
        """Return the first line of ./<name>.txt (the saved checkpoint).

        The original had an unreachable file.close() after the return,
        leaking the handle; the context manager fixes that.
        """
        full_path = "./" + name + '.txt'
        with open(full_path, 'r', encoding='utf-8') as file:
            msg = file.readline()
        print(msg)  # keep the original progress echo
        return msg
    def getWebData():
        """Crawl mingyihui.net doctor pages and push them to UCMS cid 101.

        Resumes from the doctor id stored in myh.txt; the checkpoint is
        advanced before the push so a permanently-broken page is never
        retried forever.
        """
        page = text_read('myh')
        # Loop-invariant request header.
        header = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
        }
        for i in range(int(page), 60000):
            url = "http://m.mingyihui.net/doctor_%s/index.html" % i
            html = requests.get(url, headers=header)
            html.close()
            html.encoding = 'UTF-8'
            etreeHtml = etree.HTML(html.text)
            img = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[1]/img/@src')                # portrait
            name = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[1]/span[1]/text()')  # doctor name
            jibie = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[2]/span[1]/text()') # title/rank
            keshi = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[2]/span[2]/a/text()')   # department
            yiyuanurl = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[3]/a/@href')    # hospital link
            yiyuanname = etreeHtml.xpath('/html/body//div[2]/div[1]/div[1]/div[2]/div[3]/a/text()')  # hospital name
            shanchang = etreeHtml.xpath('/html/body//div[2]/div[1]/div[5]/div/text()')               # specialities
            jingli = etreeHtml.xpath('/html/body//div[2]/div[1]/div[6]/div[1]/text()')               # biography
            print(img, name, jibie, keshi, yiyuanurl, yiyuanname)
            text_create('myh', str(i + 1))
            data = {}
            data['cid'] = 101
            data['title'] = name
            data['content'] = jingli
            data['pic'] = img
            data['type'] = jibie
            data['room'] = keshi
            data['hospital'] = yiyuanname
            data['export'] = shanchang
            data['ago'] = yiyuanurl
            # Guard on a non-empty result instead of name[0], which raised
            # IndexError on missing/deleted doctor pages.
            if name:
                r = ucms.post(posturl, data)
                # The original printed the undefined variable 'title' here
                # (a NameError on every doctor) and could reference an
                # unbound 'r' when the guard failed; print the parsed name
                # and only after an actual push.
                print(i, r, name)
            
    if __name__ == "__main__":
        # Immediate retry on failure (sleep(0) merely yields, matching the
        # original); narrowed from a bare except so KeyboardInterrupt can
        # still end the loop.
        while 1:
            try:
                getWebData()
            except Exception as e:
                print('restarting after error:', e)
                time.sleep(0)


返回