简单的Python爬虫代码演示

# author : keepython
import os
import re
import time

import requests
from lxml import etree


# 可可英语文章爬虫
def seconde_spider(page_url):
    """Download one Keke English article and append it to a text file.

    The page at ``page_url`` is fetched and decoded as UTF-8. The title is
    taken from ``h1[@id="nrtitle"]``, the English paragraphs from
    ``div[@class="qh_en"]/p`` and the Chinese paragraphs from
    ``div[@class="qh_zg"]/p``. The title, a local timestamp, the English text
    and the Chinese text are appended (mode ``'a'``) to
    ``data/keke/<title>.txt`` below the current working directory.

    Args:
        page_url: URL of the article page to scrape.

    Returns:
        Path of the text file that was written. The path is relative to the
        current working directory, not absolute.

    Raises:
        IndexError: If the page contains no ``h1[@id="nrtitle"]`` text node.
        requests.RequestException: If the page cannot be fetched, including
            when the server sends nothing for longer than the timeout.
        OSError: If the output directory or file cannot be created.
    """
    # A timeout keeps a stalled server from hanging the caller forever.
    page_html = requests.get(page_url, timeout=30)
    page_html.encoding = 'utf-8'  # decode the body as UTF-8
    selector = etree.HTML(page_html.text)  # parse into an XPath-queryable tree

    title = selector.xpath('//h1[@id="nrtitle"]/text()')[0]
    eng_text = selector.xpath('//div[@class="qh_en"]/p/text()')
    ch_text = selector.xpath('//div[@class="qh_zg"]/p/text()')
    time_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # The title comes from the remote page: replace path separators and
    # characters that are illegal in file names before using it as one.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title.strip()) or 'untitled'

    # os.path.join yields the platform's separator, so the path is valid on
    # Linux as well as Windows (a literal '\\' is not a separator on Linux).
    out_dir = os.path.join('.', 'data', 'keke')
    os.makedirs(out_dir, exist_ok=True)
    file_path = os.path.join(out_dir, safe_title + '.txt')

    # Explicit UTF-8 so that writing Chinese text does not depend on the
    # locale's default encoding.
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(title + '\n' + time_stamp + '\n')
        f.writelines(line + '\n' for line in eng_text)
        f.write('\n\n')  # blank gap between the English and Chinese parts
        f.writelines(line + '\n' for line in ch_text)
    return file_path


# 沪江英语文章爬虫，内容基本相同
def hujiang_second_spider(page_url):
    """Download one Hujiang English article and append it to a text file.

    The page at ``page_url`` is fetched and decoded as UTF-8. The title is
    taken from ``h1[@class="title"]``; the English and Chinese text are the
    text nodes directly inside ``div[@class="langs_en"]`` and
    ``div[@class="langs_cn"]`` plus the text of their ``<strong>`` children.
    The title, a local timestamp, the English text and the Chinese text are
    appended (mode ``'a'``) to ``./data/hujiang/<title>.txt``.

    Args:
        page_url: URL of the article page to scrape.

    Returns:
        Path of the text file that was written, relative to the current
        working directory.

    Raises:
        IndexError: If the page contains no ``h1[@class="title"]`` text node.
        requests.RequestException: If the page cannot be fetched, including
            when the server sends nothing for longer than the timeout.
        OSError: If the output directory or file cannot be created.
    """
    # A timeout keeps a stalled server from hanging the caller forever.
    page_html = requests.get(page_url, timeout=30)
    page_html.encoding = 'utf-8'  # decode the body as UTF-8
    selector = etree.HTML(page_html.text)  # parse into an XPath-queryable tree

    title = selector.xpath('//h1[@class="title"]/text()')[0]
    eng_text = selector.xpath(
        '//div[@class="langs_en"]/text()|//div[@class="langs_en"]/strong/text()'
    )
    # Both branches of the union need the leading '//'. Without it the second
    # branch is evaluated relative to the root element and never matches, so
    # the bold Chinese text would be silently dropped.
    ch_text = selector.xpath(
        '//div[@class="langs_cn"]/text()|//div[@class="langs_cn"]/strong/text()'
    )
    time_stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # The title comes from the remote page: replace path separators and
    # characters that are illegal in file names before using it as one.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title.strip()) or 'untitled'

    out_dir = './data/hujiang/'
    os.makedirs(out_dir, exist_ok=True)
    file_path = out_dir + safe_title + '.txt'

    # Explicit UTF-8 so that writing Chinese text does not depend on the
    # locale's default encoding.
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(title + '\n' + time_stamp + '\n')
        f.writelines(line + '\n' for line in eng_text)
        f.write('\n\n')  # blank gap between the English and Chinese parts
        f.writelines(line + '\n' for line in ch_text)

    return file_path

评论 (0)