Python Selenium Toutiao (今日头条) Scraper Class

2019-03-07 11:59 Source: 智宇SEO自媒体

  A Toutiao (今日头条) scraper class built on Python and Selenium. It drives Chrome through a chromedriver binary kept under chrome/ in the project root, restores a logged-in session from the cookie file cookie/toutiao (a sketch for generating that file follows the code), and provides methods for collecting article links (get_links), article content (get_content), and source-account information (get_source, get_source_uid).

  The code:

# -*- coding: utf-8 -*-
# @Time         : 2019/3/6 22:41
# @Author       : Zhiyu
# @File         : toutiao.py
# @Software     : PyCharm
# @Description  : Toutiao scraper class
import io
import json
import os
import platform
import random
import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

class TouTiao:
    # project root (the parent of this file's directory)
    root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    driver = None

    def __init__(self, debug=False):
        """
        Initialize the Toutiao scraper.
        """
        # launch Chrome
        try:
            executable_path = self.root + '/chrome/chromedriver'  # Linux binary
            if platform.system() != 'Linux':
                executable_path += '.exe'  # Windows uses the .exe build
            print(executable_path)
            # Chrome options
            chrome_options = webdriver.ChromeOptions()
            if not debug:
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--no-sandbox')
                chrome_options.add_argument('--disable-gpu')
                chrome_options.add_argument('--disable-setuid-sandbox')
                chrome_options.add_argument('--disable-dev-shm-usage')
            # initialize Chrome
            self.driver = webdriver.Chrome(executable_path=executable_path, options=chrome_options)
            # maximize the window
            self.driver.maximize_window()
            # read the saved cookies and add them to the browser
            with io.open(self.root + '/cookie/toutiao', 'r', encoding='utf-8') as f:
                cookies = json.loads(f.read())
            # a page on the target domain must be loaded before cookies can be added
            self.driver.get("https://www.toutiao.com/")
            for cookie in cookies:  # each cookie is a dict
                self.driver.add_cookie(cookie)
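            # each cookie dict carries the fields Selenium's add_cookie() accepts,
            # roughly: {"name": "...", "value": "...", "domain": ".toutiao.com", "path": "/"}
            # (illustrative values; the real file holds whatever was dumped at login time)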

            time.sleep(1)
        except Exception as e:
            print('repr(e):\t', repr(e))
            print("Toutiao initialization failed!")
            exit()


    def get_links(self, url):
        """
        Collect article links, read counts, comment counts, and thumbnails from a listing page.
        :param url:
        :return:
        """
        try:
            print("Collecting url %s" % url)
            self.driver.get(url)
            time.sleep(1)

            key_xpaths = [
                {"key": "url",
                 "xpath": '//*[@id="wrapper"]/div[2]/div[2]/div/div[3]/div/ul/li[*]/div/div[1]/div/div[1]/a'},  # link
                {"key": "read",
                 "xpath": '//*[@id="wrapper"]/div[2]/div[2]/div/div[3]/div/ul/li[*]/div/div[1]/div/div[2]/div[1]/a[1]'},  # read count
                {"key": "comment",
                 "xpath": '//*[@id="wrapper"]/div[2]/div[2]/div/div[3]/div/ul/li[*]/div/div[1]/div/div[2]/div[1]/a[2]'},  # comment count
                {"key": "img",
                 "xpath": '//*[@id="wrapper"]/div[2]/div[2]/div/div[3]/div/ul/li[*]/div/div[2]/a/img'},  # thumbnail
            ]
            # extract each field with its XPath
            objs = {}  # collected values, keyed by field name
            for item in key_xpaths:
                objs[item['key']] = []
                elements = self.driver.find_elements(By.XPATH, item['xpath'])
                for element in elements:
                    key = item['key']
                    if key == 'img':
                        objs[key].append(element.get_attribute("src"))
                    elif key == 'url':
                        objs[key].append(element.get_attribute("href"))
                    else:
                        objs[key].append(element.text)

            ret = []

            for k, v in enumerate(objs['url']):
                # normalize the read count: strip the '阅读' suffix, expand '万' (ten thousand)
                read = objs['read'][k]
                pos = read.find('阅读')
                if pos == -1:
                    continue
                read = read[0:pos]
                if '万' in read:
                    # e.g. '1.2万' -> roughly 12000, with small random noise added
                    read = int(float(read[0:read.find('万')]) * 10000 + random.randint(1000, 9999))
                else:
                    read = int(read)
                # strip the '评论' suffix from the comment count
                comment = objs['comment'][k]
                comment = comment[0:comment.find('评论')].strip()
                tmp = {
                    "url": v,
                    "read": read,
                    "comment": comment,
                    "img": objs['img'][k],
                }
                print(json.dumps(tmp, ensure_ascii=False))
                ret.append(tmp)
        except Exception:
            ret = None

        self.driver.quit()  # quit() also closes every open window
        self.driver = None
        return ret
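
    # get_links() returns a list of dicts, one per article, shaped like
    # (values illustrative): {"url": "https://www.toutiao.com/item/...",
    # "read": 12345, "comment": "8", "img": "https://..."}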


    def get_content(self, url):
        """
        Fetch the title, body HTML, tags, and abstract of an article page.
        :param url:
        :return:
        """
        try:
            print("Collecting url %s" % url)
            self.driver.get(url)
            time.sleep(1)
            abstract = self.driver.find_element(By.XPATH, "//meta[@name='description']").get_attribute("content")
            title = self.driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[2]/div[1]/h1').text
            print(title)
            try:
                content = self.driver.find_element(
                    By.XPATH, '/html/body/div[1]/div[2]/div[2]/div[1]/div[2]/div').get_attribute("outerHTML")
            except Exception:
                # fall back to the parent container if the inner div is absent
                content = self.driver.find_element(
                    By.XPATH, "/html/body/div[1]/div[2]/div[2]/div[1]/div[2]").get_attribute("outerHTML")

            print(content)
            tags = []

            tags_elements = self.driver.find_elements(
                By.XPATH, '/html/body/div[1]/div[2]/div[2]/div[1]/div[3]/div[1]/ul/li[*]/a')
            for element in tags_elements:
                tags.append(element.text)
            tags = json.dumps(tags, ensure_ascii=False)

            print(tags)

            ret = {
                "title": title,
                "content": content,
                "tags": tags,
                "abstract": abstract
            }
        except Exception:
            ret = None

        self.driver.quit()
        self.driver = None
        return ret


    def get_source(self, url):
        """
        Collect links to the source accounts (authors) listed on a page.
        """
        print("Collecting url %s" % url)
        self.driver.get(url)
        time.sleep(1)
        try:
            xpath = "/html/body/div[1]/div[4]/div[2]/div[2]/div/div/div/ul/li[*]/div/div[1]/div/div[2]/div[1]/div/a[2]"
            elements = self.driver.find_elements(By.XPATH, xpath)
            source_uids = []
            for element in elements:
                url = element.get_attribute('href')
                if "www.toutiao.com/c/user" in url:
                    source = element.text
                    tmp = {"url": url, "source": source}
                    source_uids.append(tmp)
                    print(json.dumps(tmp, ensure_ascii=False))
            ret = source_uids
        except Exception:
            ret = None

        self.driver.quit()
        self.driver = None
        return ret

    def get_source_uid(self, url):
        """
        Fetch a source account's name and follower count from its profile page.
        """
        print("Collecting url %s" % url)
        self.driver.get(url)
        time.sleep(1)
        try:
            page_source = self.driver.page_source
            xpath = '//*[@id="wrapper"]/div[1]/div/ul/li[1]/a/span[1]'
            source_name = self.driver.find_element(By.XPATH, xpath).text
            # the follower count is embedded in the page's JS, e.g. fensi:'1234',
            search_obj = re.search(r"fensi:'(.*)',", page_source, re.M | re.I)
            if search_obj:
                fensi = search_obj.group(1)
            else:
                fensi = 0

            ret = {
                "source_name": source_name,
                "fensi": fensi,  # follower count
            }

        except Exception as e:
            print('repr(e):\t', repr(e))
            ret = None

        self.driver.quit()
        self.driver = None
        return ret



Example usage (each method quits the browser when it finishes, so create a fresh TouTiao instance for every call):

toutiao = TouTiao()
links = toutiao.get_links("https://www.toutiao.com/c/user/74963784964/#mid=1583390130936845")

toutiao = TouTiao()
content = toutiao.get_content("https://www.toutiao.com/item/6651915832546820611/")
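
The class assumes a ready-made cookie file at cookie/toutiao, but nothing above shows how to produce one. Below is a minimal one-off sketch for creating it, assuming chromedriver is on your PATH and that you log in by hand in the visible window; the file name save_cookie.py and the 60-second wait are illustrative choices, not part of the original class. Run it so the output lands where TouTiao expects it (the cookie/ directory under the project root).

# -*- coding: utf-8 -*-
# save_cookie.py - one-off helper to dump a logged-in Toutiao session
import io
import json
import time

from selenium import webdriver

driver = webdriver.Chrome()  # visible window, so you can log in manually
driver.get("https://www.toutiao.com/")
time.sleep(60)  # log in by hand before this timer runs out
with io.open('cookie/toutiao', 'w', encoding='utf-8') as f:
    # get_cookies() returns a list of dicts in the format add_cookie() accepts
    f.write(json.dumps(driver.get_cookies(), ensure_ascii=False))
driver.quit()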