闽公网安备 35020302035485号
public static class HttpHelper {
public const string UserAgent =
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36";
public static HttpClientHandler Handler { get; }
public static HttpClient Client { get; }
static HttpHelper() {
Handler = new HttpClientHandler();
Client = new HttpClient(Handler);
Client.DefaultRequestHeaders.Add("User-Agent", UserAgent);
}
public static async Task<IHtmlDocument> GetHtmlDocument(string url) {
var html = await Client.GetStringAsync(url);
// todo 这个用法有内存泄漏问题,得优化一下 www.duidaima.com
return new HtmlParser().ParseDocument(html);
}
public static async Task<IHtmlDocument> GetHtmlDocument(string url, string charset) {
var res = await Client.GetAsync(url);
var resBytes = await res.Content.ReadAsByteArrayAsync();
var resStr = Encoding.GetEncoding(charset).GetString(resBytes);
// todo 这个用法有内存泄漏问题,得优化一下
return new HtmlParser().ParseDocument(resStr);
}
}
这段代码里面有俩 todo ,这个内存泄漏的问题在简单的爬虫中影响不大,所以后面有大规模的需求再来优化吧~IHtmlDocument data = await HttpHelper.GetHtmlDocument(url);拿到 IHtmlDocument 对象之后,用 QuerySelector 传入css选择器,就可以拿到各种元素了。例如这样,取出 <li> 元素下所有链接的地址
var data = await HttpHelper.GetHtmlDocument(url);
foreach (var item in data.QuerySelectorAll(".pagew li")) {
var link = item.QuerySelector("a");
var href = link?.GetAttribute("href");
if (href != null) await CrawlItem(href);
}
或者结合正则表达式var data = await HttpHelper.GetHtmlDocument(url);
var page = data.QuerySelector(".pageinfo");
Console.WriteLine("拿到分页信息:{0}", page?.TextContent);
var match = Regex.Match(page?.TextContent ?? "", @"共\s(\d+)页(\d+)条");
var pageCount = int.Parse(match.Groups[1].Value);
for (int i = 1; i <= pageCount; i++) {
await CrawlPage(i);
}
正则表达式非常好用,爬虫必备~这里再推荐一个好用的东西,菜鸟工具的在线正则表达式测试,拿到一个字符串之后,先在测试器里面写出一个能匹配的正则,再放到程序里,效率更高~var jsonOption = new JsonSerializerOptions {
WriteIndented = true,
Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping
};
写入文件await File.WriteAllTextAsync("path", JsonSerializer.Serialize(data, jsonOption));
public static IDownloadService Downloader { get; }
public static DownloadConfiguration DownloadConf => new DownloadConfiguration {
BufferBlockSize = 10240, // 通常,主机最大支持8000字节,默认值为8000。
ChunkCount = 8, // 要下载的文件分片数量,默认值为1
// MaximumBytesPerSecond = 1024 * 50, // 下载速度限制,默认值为零或无限制
MaxTryAgainOnFailover = 5, // 失败的最大次数
ParallelDownload = true, // 下载文件是否为并行的。默认值为false
Timeout = 1000, // 每个 stream reader 的超时(毫秒),默认值是1000
RequestConfiguration = {
Accept = "*/*",
AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
CookieContainer = new CookieContainer(), // Add your cookies
Headers = new WebHeaderCollection(), // Add your custom headers
KeepAlive = true,
ProtocolVersion = HttpVersion.Version11, // Default value is HTTP 1.1
UseDefaultCredentials = false,
UserAgent = UserAgent
}
};
static HttpHelper() {
// ...
Downloader = new DownloadService(DownloadConf);
}
使用方法依然是一行代码await HttpHelper.Downloader.DownloadFileTaskAsync(url, filepath);不过这次没有直接封装一个下载的方法,而是把 IDownloadService 对象做成属性,因为下载的时候往往要加一些“buff”,比如监听下载进度,看下面的代码
HttpHelper.Downloader.DownloadStarted += DownloadStarted; HttpHelper.Downloader.DownloadFileCompleted += DownloadFileCompleted; HttpHelper.Downloader.DownloadProgressChanged += DownloadProgressChanged; HttpHelper.Downloader.ChunkDownloadProgressChanged += ChunkDownloadProgressChanged;这个库提供了四个事件,分别是:
4.分块下载进度变化
using var bar = new ProgressBar(10, "正在下载所有图片", BarOptions);上面代码定义了10个任务,每执行一次 bar.Tick() 就表示完成一次任务,执行10次后就整个完成~
var list = // 加载图集列表
using var bar = new ProgressBar(list.Count, "正在下载所有图片", BarOptions);
foreach (var item in list) {
bar.Message = $"图集:{item.Name}";
bar.Tick();
foreach (var imgUrl in item.Images) {
using (var childBar = bar.Spawn(item.ImageCount,$"图片:{imgUrl}",ChildBarOptions)) {
childBar.Tick();
// 具体的下载代码
}
}
}
这样就实现了主进度条显示下载了第几个图集,子进度条显示下载到第几张图片。然后具体下载代码中,使用 Downloader 的事件监听,再 Spawn 一个新的进度条显示单张图片的下载进度。private async Task Download(IProgressBar bar, string url, string filepath) {
var percentageBar = bar.Spawn(100, $"正在下载:{Path.GetFileName(url)}", PercentageBarOptions);
HttpHelper.Downloader.DownloadStarted += DownloadStarted;
HttpHelper.Downloader.DownloadFileCompleted += DownloadFileCompleted;
HttpHelper.Downloader.DownloadProgressChanged += DownloadProgressChanged;
await HttpHelper.Downloader.DownloadFileTaskAsync(url, filepath);
void DownloadStarted(object? sender, DownloadStartedEventArgs e) {
Trace.WriteLine(
$"图片, FileName:{Path.GetFileName(e.FileName)}, TotalBytesToReceive:{e.TotalBytesToReceive}");
}
void DownloadFileCompleted(object? sender, AsyncCompletedEventArgs e) {
Trace.WriteLine($"下载完成, filepath:{filepath}");
percentageBar.Dispose();
}
void DownloadProgressChanged(object? sender, DownloadProgressChangedEventArgs e) {
percentageBar.AsProgress<double>().Report(e.ProgressPercentage);
}
}
注意所有的 ProgressBar 对象都需要用完释放,所以这里在 DownloadFileCompleted 事件里面 Dispose 了。上面的是直接用 using 语句,自动释放。var barOptions = new ProgressBarOptions {
ForegroundColor = ConsoleColor.Yellow,
BackgroundColor = ConsoleColor.DarkYellow,
ForegroundColorError = ConsoleColor.Red,
ForegroundColorDone = ConsoleColor.Green,
BackgroundCharacter = '\u2593',
ProgressBarOnBottom = true,
EnableTaskBarProgress = RuntimeInformation.IsOSPlatform(OSPlatform.Windows),
DisplayTimeInRealTime = false,
ShowEstimatedDuration = false
};
EnableTaskBarProgress 这个选项可以同时更新Windows任务状态栏上的进度具体配置选项可以直接看源码,里面注释很详细。如果 Spawn 出来的子进度条没配置选项,那就会继承上一级的配置。