公告

本教程仅供学习交流,如有侵权,请作者联系我进行删除

整体思路

首先构建请求头,用来防止网站的反爬虫措施

head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}

关于请求头可以按F12找到Network界面查看

携带请求头访问URL,并将获取到的信息转化为html源码

html = requests.get(url, headers=head)
html = html.text

使用正则筛选出对应的博客URL

text = re.findall(r"<main>(.*?)</main>", html, re.S)

text1 = re.findall(r'<div class="article-list">(.*)</div>', str(text), re.S)

titleurls = re.findall(r'<a href="(.*?)" target="_blank">.*?</a>', str(text1), re.S)
table = re.findall(r'<table .*?>(.*?)</table>', html, re.S)
trlist = re.findall(r'<tr class.*?>(.*?)</tr>', str(table), re.S)
tdlist = re.findall(r'<td>(.*?)</td>', str(trlist), re.S)

具体的正则可根据获取的URL进行分析

整体代码

# Getip
# 获取代理IP
import re
import requests

def get(url):
    """Fetch a proxy-list page and return a dict {index: proxy_ip}.

    The page is expected to hold one HTML table in which every row has
    six <td> cells; the first cell of each row is the IP address.
    """
    # Browser-like User-Agent so the proxy site's basic anti-crawler
    # check does not reject the request.
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}

    page = requests.get(url, headers=head).text
    # Narrow down in stages: table body -> rows -> cells.
    table = re.findall(r'<table .*?>(.*?)</table>', page, re.S)
    rows = re.findall(r'<tr class.*?>(.*?)</tr>', str(table), re.S)
    cells = re.findall(r'<td>(.*?)</td>', str(rows), re.S)
    # Every 6th cell is an IP; key the results 0..n-1 like the original loop.
    return dict(enumerate(cells[::6]))
# GetTitleUrl
# 获取博客的URL
import requests
import re


def get(url):
    """Fetch a blog index page and return a dict {index: article_url}.

    Each article link appears twice in the page source, so only every
    other match is kept.
    """
    head = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
    source = requests.get(url, headers=head).text

    # Narrow down in stages: <main> -> article list -> anchor hrefs.
    main_part = re.findall(r"<main>(.*?)</main>", source, re.S)
    article_list = re.findall(r'<div class="article-list">(.*)</div>', str(main_part), re.S)
    links = re.findall(r'<a href="(.*?)" target="_blank">.*?</a>', str(article_list), re.S)

    # Keep the first of each duplicate pair, keyed 0..len//2-1.
    return {j: links[2 * j] for j in range(len(links) // 2)}
# Main
# 主函数
import requests
import time
import threading

from src import GetTitleUrl
from src import GetIp


# Shared request headers: a browser-like User-Agent so target sites'
# basic anti-crawler checks do not reject our requests.
head = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
url = ""    # Target blog index URL goes here


def run(iplist, titleurl, num):
    """Endlessly visit blog URLs through the given proxies, logging each hit.

    iplist   -- dict {index: proxy_ip} as produced by GetIp.get
    titleurl -- dict {index: blog_url} as produced by GetTitleUrl.get
    num      -- thread number, used only in the log output
    """
    # Guard: an empty titleurl would raise ZeroDivisionError on the modulo
    # below, and an empty iplist would turn "while True" into a busy spin.
    if not iplist or not titleurl:
        return
    while True:
        for i in range(len(iplist)):
            # Cycle through the blog URLs as we walk the proxy list.
            j = i % len(titleurl)
            ip = {"http": iplist[i]}
            try:
                # timeout: a dead free proxy must not hang this thread forever.
                requests.get(titleurl[j], headers=head, proxies=ip, timeout=10)
            except requests.RequestException:
                # Free proxies fail often; skip this one and keep the
                # thread alive instead of letting the exception kill it.
                continue
            print("时间:" + time.asctime(time.localtime(time.time())), end=" ")
            print("线程:" + str(num), end=' ')
            print("使用IP:" + iplist[i], end=" ")
            print("访问博客:" + titleurl[j])

if __name__ == '__main__':
    # Collect the blog URLs once; all worker threads share this dict.
    titleurl = GetTitleUrl.get(url)
    # One proxy-list page per worker thread (pages 1..6).
    # BUG FIX: the original code created t6 but never called t6.start(),
    # so the sixth thread (and its proxy list) was silently unused.
    threads = []
    for num in range(1, 7):
        iplist = GetIp.get("https://www.xicidaili.com/nn/" + str(num))
        threads.append(threading.Thread(target=run, args=(iplist, titleurl, num)))
    for t in threads:
        t.start()

一只小菜鸡