/// <summary>
/// Process-wide HTTP helpers for the crawler: one shared <see cref="HttpClient"/>
/// plus convenience methods that fetch a URL and parse the body into an
/// <see cref="IHtmlDocument"/>.
/// </summary>
public static class HttpHelper
{
    /// <summary>User-Agent value added to the default headers of <see cref="Client"/>.</summary>
    public const string UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36";

    /// <summary>The handler backing <see cref="Client"/>.</summary>
    public static HttpClientHandler Handler { get; }

    /// <summary>The single shared client; created once in the static constructor.</summary>
    public static HttpClient Client { get; }

    static HttpHelper()
    {
        Handler = new HttpClientHandler();
        Client = new HttpClient(Handler);
        Client.DefaultRequestHeaders.Add("User-Agent", UserAgent);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> as a string and parses it as HTML.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <returns>The parsed document.</returns>
    public static async Task<IHtmlDocument> GetHtmlDocument(string url)
    {
        var html = await Client.GetStringAsync(url);

        // TODO (original author): this usage leaks memory and should be optimized.
        // NOTE(review): presumably because a parser is created per call and the
        // returned document is never disposed by callers - confirm.
        return new HtmlParser().ParseDocument(html);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> as raw bytes, decodes them with the named
    /// <paramref name="charset"/> and parses the result as HTML.
    /// </summary>
    /// <param name="url">Address of the page to fetch.</param>
    /// <param name="charset">Encoding name passed to <see cref="Encoding.GetEncoding(string)"/>.</param>
    /// <returns>The parsed document.</returns>
    public static async Task<IHtmlDocument> GetHtmlDocument(string url, string charset)
    {
        // Dispose the response once the body has been buffered; the original
        // never released it. Unlike GetStringAsync, GetAsync does not throw on a
        // non-success status code, so an error page is decoded and parsed as well.
        using var res = await Client.GetAsync(url);
        var resBytes = await res.Content.ReadAsByteArrayAsync();

        // NOTE(review): on .NET Core / .NET 5+ legacy code pages (e.g. GBK) are only
        // available after registering CodePagesEncodingProvider - confirm the host does so.
        var resStr = Encoding.GetEncoding(charset).GetString(resBytes);

        // TODO (original author): this usage leaks memory and should be optimized.
        return new HtmlParser().ParseDocument(resStr);
    }
}
这段代码里面有俩 todo ,这个内存泄漏的问题在简单的爬虫中影响不大,所以后面有大规模的需求再来优化吧~
IHtmlDocument data = await HttpHelper.GetHtmlDocument(url);拿到 IHtmlDocument 对象之后,用 QuerySelector 传入 CSS 选择器,就可以拿到各种元素了。例如这样,取出 <li> 元素下所有链接的地址:
// Fetch the listing page, then crawl the first link found inside every
// ".pagew li" element; list items without a link (or without href) are skipped.
var document = await HttpHelper.GetHtmlDocument(url);
var listItems = document.QuerySelectorAll(".pagew li");
foreach (var listItem in listItems)
{
    var href = listItem.QuerySelector("a")?.GetAttribute("href");
    if (href == null)
    {
        continue;
    }

    await CrawlItem(href);
}
或者结合正则表达式
// Read the pager text from the ".pageinfo" element and crawl every page it announces.
var data = await HttpHelper.GetHtmlDocument(url);
var page = data.QuerySelector(".pageinfo");
Console.WriteLine("拿到分页信息:{0}", page?.TextContent);

// Capture group 1 is the page count, group 2 the item count.
var match = Regex.Match(page?.TextContent ?? "", @"共\s(\d+)页(\d+)条");

// When the element is missing or the text has a different shape, Groups[1].Value
// is "" and int.Parse would throw FormatException; guard instead of crashing.
if (match.Success && int.TryParse(match.Groups[1].Value, out var pageCount))
{
    for (int i = 1; i <= pageCount; i++)
    {
        await CrawlPage(i);
    }
}
else
{
    Console.WriteLine("未能解析分页信息,跳过分页抓取");
}
正则表达式非常好用,爬虫必备~这里再推荐一个好用的东西,菜鸟工具的在线正则表达式测试,拿到一个字符串之后,先在测试器里面写出一个能匹配的正则,再放到程序里,效率更高~
var jsonOption = new JsonSerializerOptions { WriteIndented = true, Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping };写入文件
await File.WriteAllTextAsync("path", JsonSerializer.Serialize(data, jsonOption));
/// <summary>Shared download service, created in the static constructor from <see cref="DownloadConf"/>.</summary>
public static IDownloadService Downloader { get; }

/// <summary>
/// Download settings used by the crawler. Every access builds a new
/// <see cref="DownloadConfiguration"/> (including a new cookie container and header collection).
/// </summary>
public static DownloadConfiguration DownloadConf => CreateDownloadConfiguration();

static HttpHelper()
{
    // ...
    Downloader = new DownloadService(DownloadConf);
}

/// <summary>Builds one configuration instance with the crawler's download settings.</summary>
private static DownloadConfiguration CreateDownloadConfiguration()
{
    var configuration = new DownloadConfiguration
    {
        // Usually the host supports at most 8000 bytes; the default value is 8000.
        BufferBlockSize = 10240,
        // Number of chunks the file is split into; the default value is 1.
        ChunkCount = 8,
        // MaximumBytesPerSecond = 1024 * 50, // download speed limit; default is zero (unlimited)
        // Maximum number of retries on failure.
        MaxTryAgainOnFailover = 5,
        // Whether chunks are downloaded in parallel; the default value is false.
        ParallelDownload = true,
        // Timeout of each stream reader in milliseconds; the default value is 1000.
        Timeout = 1000
    };

    // Adjust the request settings on the instance the configuration already exposes.
    var request = configuration.RequestConfiguration;
    request.Accept = "*/*";
    request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    request.CookieContainer = new CookieContainer(); // Add your cookies
    request.Headers = new WebHeaderCollection(); // Add your custom headers
    request.KeepAlive = true;
    request.ProtocolVersion = HttpVersion.Version11; // Default value is HTTP 1.1
    request.UseDefaultCredentials = false;
    request.UserAgent = UserAgent;

    return configuration;
}
使用方法依然是一行代码
await HttpHelper.Downloader.DownloadFileTaskAsync(url, filepath);不过这次没有直接封装一个下载的方法,而是把 IDownloadService 对象做成属性,因为下载的时候往往要加一些“buff”,比如监听下载进度,看下面的代码
HttpHelper.Downloader.DownloadStarted += DownloadStarted; HttpHelper.Downloader.DownloadFileCompleted += DownloadFileCompleted; HttpHelper.Downloader.DownloadProgressChanged += DownloadProgressChanged; HttpHelper.Downloader.ChunkDownloadProgressChanged += ChunkDownloadProgressChanged;这个库提供了四个事件,分别是:
1.下载开始
2.下载完成
3.下载进度变化
4.分块下载进度变化
using var bar = new ProgressBar(10, "正在下载所有图片", BarOptions);上面代码定义了10个任务,每执行一次 bar.Tick() 就表示完成一次任务,执行10次后就整个完成~
// Placeholder kept from the article: load the album list here.
var list = /* load the album list */;

// Main bar: one tick per album.
using var bar = new ProgressBar(list.Count, "正在下载所有图片", BarOptions);
foreach (var item in list)
{
    bar.Message = $"图集:{item.Name}";

    // One child bar per album, sized to its image count and ticked once per image,
    // so it shows "image k of N". Spawning it inside the image loop (as before)
    // created a fresh N-tick bar for every image and ticked each only once.
    using (var childBar = bar.Spawn(item.ImageCount, $"图集:{item.Name}", ChildBarOptions))
    {
        foreach (var imgUrl in item.Images)
        {
            childBar.Message = $"图片:{imgUrl}";

            // The actual download code goes here.

            childBar.Tick();
        }
    }

    // Count the album as done only after all of its images were processed.
    bar.Tick();
}
这样就实现了主进度条显示下载了第几个图集,子进度条显示下载到第几张图片。然后具体下载代码中,使用 Downloader 的事件监听,再 Spawn 一个新的进度条显示单张图片的下载进度。
/// <summary>
/// Downloads <paramref name="url"/> to <paramref name="filepath"/> through the shared
/// <c>HttpHelper.Downloader</c>, showing the progress on a child bar spawned under <paramref name="bar"/>.
/// </summary>
/// <param name="bar">Parent progress bar the per-file bar is spawned from.</param>
/// <param name="url">Address of the file to download.</param>
/// <param name="filepath">Destination path on disk.</param>
private async Task Download(IProgressBar bar, string url, string filepath)
{
    // Disposed when the method exits, whether the download succeeds or throws.
    using var percentageBar = bar.Spawn(100, $"正在下载:{Path.GetFileName(url)}", PercentageBarOptions);
    var progress = percentageBar.AsProgress<double>();

    var downloader = HttpHelper.Downloader;
    downloader.DownloadStarted += DownloadStarted;
    downloader.DownloadFileCompleted += DownloadFileCompleted;
    downloader.DownloadProgressChanged += DownloadProgressChanged;
    try
    {
        await downloader.DownloadFileTaskAsync(url, filepath);
    }
    finally
    {
        // The downloader is a shared static instance: without unsubscribing, every
        // call would leave its handlers (and its captured bar) attached, so later
        // downloads would also report to bars that were already disposed.
        downloader.DownloadStarted -= DownloadStarted;
        downloader.DownloadFileCompleted -= DownloadFileCompleted;
        downloader.DownloadProgressChanged -= DownloadProgressChanged;
    }

    void DownloadStarted(object? sender, DownloadStartedEventArgs e)
    {
        Trace.WriteLine(
            $"图片, FileName:{Path.GetFileName(e.FileName)}, TotalBytesToReceive:{e.TotalBytesToReceive}");
    }

    void DownloadFileCompleted(object? sender, AsyncCompletedEventArgs e)
    {
        Trace.WriteLine($"下载完成, filepath:{filepath}");
    }

    void DownloadProgressChanged(object? sender, DownloadProgressChangedEventArgs e)
    {
        // NOTE(review): ProgressPercentage looks like a 0-100 value while the progress
        // adapter may expect a 0-1 fraction - confirm against the progress-bar library.
        progress.Report(e.ProgressPercentage);
    }
}
注意所有的 ProgressBar 对象都需要用完释放,这里和上面一样直接用 using 语句自动释放;另外 Downloader 是共享的静态对象,所以下载结束后要在 finally 里取消事件订阅,否则每次调用都会叠加一组处理函数。
var barOptions = new ProgressBarOptions { ForegroundColor = ConsoleColor.Yellow, BackgroundColor = ConsoleColor.DarkYellow, ForegroundColorError = ConsoleColor.Red, ForegroundColorDone = ConsoleColor.Green, BackgroundCharacter = '\u2593', ProgressBarOnBottom = true, EnableTaskBarProgress = RuntimeInformation.IsOSPlatform(OSPlatform.Windows), DisplayTimeInRealTime = false, ShowEstimatedDuration = false };EnableTaskBarProgress 这个选项可以同时更新 Windows 任务栏上的进度。具体配置选项可以直接看源码,里面注释很详细。如果 Spawn 出来的子进度条没配置选项,那就会继承上一级的配置。