import random
import time
from typing import Optional, Dict

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Cache-Control": "no-cache",
}
class StableFetcher:
    def __init__(self, base_delay: float = 0.6, max_delay: float = 8.0,
                 proxies: Optional[Dict[str, str]] = None):
        self.sess = requests.Session()
        # Transport-level retries: urllib3 retries connection/read failures and
        # common throttling/server statuses with exponential backoff, honoring
        # any Retry-After header the server sends.
        retry = Retry(
            total=5,
            connect=3,
            read=3,
            backoff_factor=0.5,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST"],
            respect_retry_after_header=True,
        )
        adapter = HTTPAdapter(max_retries=retry, pool_connections=50, pool_maxsize=50)
        self.sess.mount("http://", adapter)
        self.sess.mount("https://", adapter)
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.proxies = proxies or {}
    def _sleep_with_jitter(self, step: int):
        # Exponential backoff capped at max_delay, with up to 25% random jitter
        # so concurrent clients do not retry in lockstep.
        delay = min(self.base_delay * (2 ** (step - 1)), self.max_delay)
        jitter = random.uniform(0, delay * 0.25)
        time.sleep(delay + jitter)
    def get(self, url: str, headers: Optional[Dict[str, str]] = None,
            max_attempts: int = 4) -> requests.Response:
        # Per-call headers override the defaults key by key.
        final_headers = {**DEFAULT_HEADERS, **(headers or {})}
        last_exc = None
        for i in range(1, max_attempts + 1):
            try:
                r = self.sess.get(
                    url,
                    headers=final_headers,
                    proxies=self.proxies,
                    timeout=15,
                    allow_redirects=True,
                )
                # Application-level retry: back off and try again when the
                # server signals blocking (403) or rate limiting (429).
                if r.status_code in (403, 429):
                    self._sleep_with_jitter(i)
                    continue
                return r
            except requests.RequestException as e:
                last_exc = e
                self._sleep_with_jitter(i)
        raise last_exc if last_exc else RuntimeError("Request failed with no exception captured")
if __name__ == "__main__":
    fetcher = StableFetcher(proxies=None)
    url = "https://example.com/search?q=python"
    resp = fetcher.get(url)
    print(resp.status_code, resp.url)
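# Optional variant, not part of the original listing: a minimal sketch of
# routing requests through a proxy and overriding headers per call. The proxy
# address and the Referer value below are placeholders; requests expects a
# scheme -> proxy-URL mapping for its proxies argument.
def fetch_via_proxy_demo() -> None:
    proxy_fetcher = StableFetcher(
        base_delay=1.0,
        proxies={"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"},
    )
    resp = proxy_fetcher.get(
        "https://example.com/search?q=python",
        headers={"Referer": "https://example.com/"},
    )
    print(resp.status_code, len(resp.text))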