Python 爬虫抓取新闻并存入自己的远程 MySQL 数据库!这段代码是我自己花了很长时间才写好的,分享给大家。喜欢的话请点个赞。


# -*- coding: utf-8 -*-
from xml.etree import ElementTree as ET
import datetime
import random

import pymysql
from selenium import webdriver
from lxml import etree
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def strreplace_v1(old_str, key, value):
    """Return ``old_str`` with every occurrence of ``key`` replaced by ``value``.

    Thin wrapper around ``old_str.replace(key, value)``; the input itself is
    not modified.

    Args:
        old_str: Text to search (must provide a ``replace`` method, e.g. ``str``).
        key: Substring to look for.
        value: Replacement substring.

    Returns:
        The result of ``old_str.replace(key, value)``.
    """
    return old_str.replace(key, value)


def get_page_source_html(driver, urlinfo):
    """Navigate ``driver`` to ``urlinfo`` and parse the resulting page source.

    Args:
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (presumably a Selenium WebDriver -- confirm against the caller).
        urlinfo: Address passed unchanged to ``driver.get``.

    Returns:
        The value produced by ``etree.HTML`` for the page source read right
        after navigation.
    """
    # Navigate first so that page_source reflects the requested URL.
    driver.get(urlinfo)
    return etree.HTML(driver.page_source)


def get_page_source_etree(driver):
    """Parse the page currently held by ``driver`` without navigating.

    Args:
        driver: Object exposing a ``page_source`` attribute (presumably a
            Selenium WebDriver -- confirm against the caller).

    Returns:
        The value produced by ``etree.HTML`` for ``driver.page_source``.
    """
    return etree.HTML(driver.page_source)


def get_list_a(etree, xpathinfo):
    """Evaluate ``xpathinfo`` on the given tree and return the matches.

    Note: inside this function the ``etree`` parameter shadows the
    module-level ``etree`` import; it is the parsed tree, not the module.

    Args:
        etree: Object exposing an ``xpath(expression)`` method.
        xpathinfo: Expression handed unchanged to ``etree.xpath``
            (presumably selecting link nodes, judging by the function name).

    Returns:
        Whatever ``etree.xpath(xpathinfo)`` returns.
    """
    matches = etree.xpath(xpathinfo)
    return matches


def get_news_title(etree, xpathino):
    """Evaluate ``xpathino`` on the given tree and return the matches.

    Note: inside this function the ``etree`` parameter shadows the
    module-level ``etree`` import; it is the parsed tree, not the module.

    Args:
        etree: Object exposing an ``xpath(expression)`` method.
        xpathino: Expression handed unchanged to ``etree.xpath``
            (presumably selecting the article title, judging by the name).

    Returns:
        Whatever ``etree.xpath(xpathino)`` returns.
    """
    title_matches = etree.xpath(xpathino)
    return title_matches


def get_news_content(etree, xpathino):
    """Evaluate ``xpathino`` on the given tree and return the matches.

    Note: inside this function the ``etree`` parameter shadows the
    module-level ``etree`` import; it is the parsed tree, not the module.

    Args:
        etree: Object exposing an ``xpath(expression)`` method.
        xpathino: Expression handed unchanged to ``etree.xpath``
            (presumably selecting the article body, judging by the name).

    Returns:
        Whatever ``etree.xpath(xpathino)`` returns.
    """
    content_matches = etree.xpath(xpathino)
    return content_matches


def get_news_publish(etree, xpathino):
    """Evaluate ``xpathino`` on the given tree and return the matches.

    Note: inside this function the ``etree`` parameter shadows the
    module-level ``etree`` import; it is the parsed tree, not the module.

    Args:
        etree: Object exposing an ``xpath(expression)`` method.
        xpathino: Expression handed unchanged to ``etree.xpath``
            (presumably selecting publication info, judging by the name).

    Returns:
        Whatever ``etree.xpath(xpathino)`` returns.
    """
    publish_matches = etree.xpath(xpathino)
    return publish_matches


def getUA():
    uaList = [
        # 360
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        # chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",

        # firefox
       
02-05 02:26