Refactor downloaders to use generic options and unify logic
Replaces specialized binary and HTML downloaders with a generic, options-driven UnitDownloader and UnitFragmentDownloader pattern. Introduces UnitDownloaderOptions and builder classes for flexible configuration, updates interfaces and method signatures to support progress reporting, and removes redundant binary-specific classes. Updates Playwright and Stealth downloaders to use the new generic base, and adds improved error handling and reporting. Also updates dependency versions and makes minor API consistency improvements across the Fluent and Models layers.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
using Beam.Abstractions;
|
||||
using Beam.Models;
|
||||
using HtmlAgilityPack;
|
||||
using File = System.IO.File;
|
||||
|
||||
namespace Beam.Downloaders {
|
||||
/// <summary>
|
||||
@@ -10,34 +11,104 @@ namespace Beam.Downloaders {
|
||||
/// <param name="web"></param>
|
||||
/// <param name="transformer"></param>
|
||||
/// <param name="failurePredicate"></param>
|
||||
public class UnitDownloader<T>(HtmlWeb web, AsyncTransformer<HtmlDocument, T> transformer, AsyncDownloadFailurePredicate<HtmlDocument>?[]? failurePredicate = null) : IUnitDownloader<T> {
|
||||
public HtmlWeb Web { get; } = web;
|
||||
public virtual AsyncTransformer<HtmlDocument, T> Transformer { get; } = transformer;
|
||||
public virtual AsyncDownloadFailurePredicate<HtmlDocument>?[]? FailurePredicates { get; } = failurePredicate;
|
||||
public class UnitDownloader<RawType, OutType>(UnitDownloaderOptions<RawType, OutType> options) : IUnitDownloader<OutType> where RawType : IDocument {
|
||||
public UnitDownloaderOptions<RawType, OutType> Options { get; } = options;
|
||||
public HttpClient Client => Options.Client;
|
||||
public virtual AsyncTransformer<RawType, OutType> Transformer => Options.AsyncTransformer;
|
||||
|
||||
public virtual AsyncDownloadFailurePredicate<RawType>?[]? FailurePredicates =>
|
||||
Options?.FailurePredicateOptions?.AsyncDownloadFailurePredicates;
|
||||
|
||||
public int LinksPerDownload { get; } = 1;
|
||||
|
||||
protected virtual async Task<bool> IsFailure(HtmlDocument doc) {
|
||||
if (FailurePredicates is null)
|
||||
return false;
|
||||
var failed = false;
|
||||
await Parallel.ForEachAsync(FailurePredicates, async (x, ct) => {
|
||||
if (failed == true)
|
||||
return;
|
||||
if (x is null)
|
||||
return;
|
||||
if (await x(doc))
|
||||
failed = true;
|
||||
});
|
||||
/// <summary>
/// Downloads the response body of <paramref name="url"/> with <see cref="Client"/> and copies it
/// into <paramref name="destinationStream"/> in chunks of at most <paramref name="bufferSize"/> bytes,
/// reporting progress after every chunk that is written.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="bufferSize">Size, in bytes, of the copy buffer. Must be positive.</param>
/// <param name="destinationStream">Stream that receives the downloaded bytes. It is not disposed here.</param>
/// <param name="progress">Receives a <see cref="DownloadReport"/> per chunk; may be null, in which case nothing is reported.</param>
/// <param name="ct">Token observed by the request, every read, every write, and once per loop iteration.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="bufferSize"/> is zero or negative.</exception>
/// <exception cref="HttpRequestException">The request failed or returned a non-success status code.</exception>
/// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
protected virtual async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress<IDownloadReport> progress,
    CancellationToken ct) {

    // A zero-length buffer would make the first read return 0 and silently yield an empty download.
    if (bufferSize <= 0)
        throw new ArgumentOutOfRangeException(nameof(bufferSize));

    // ResponseHeadersRead keeps the body streaming (as GetStreamAsync does) while still exposing
    // the Content-Length header. The HTTP body stream itself is not seekable, so its Length
    // property cannot be used to compute the remaining byte count.
    using var response = await Client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead, ct);
    response.EnsureSuccessStatusCode();
    long? total = response.Content.Headers.ContentLength;

    await using var stream = await response.Content.ReadAsStreamAsync(ct);
    byte[] buffer = new byte[bufferSize];
    int inBuffer;
    long downloaded = 0;
    while ((inBuffer = await stream.ReadAsync(buffer, ct)) > 0) {
        downloaded += inBuffer;
        await destinationStream.WriteAsync(buffer.AsMemory(0, inBuffer), ct);
        progress?.Report(new DownloadReport() {
            // Size of the chunk just written (not the running total), unchanged from before.
            // NOTE(review): confirm consumers expect a per-chunk value rather than a cumulative one.
            BytesDownloaded = inBuffer,
            // NOTE(review): -1 is used when the server sent no Content-Length; confirm consumers accept it.
            BytesRemaining = total.HasValue ? Math.Max(0, total.Value - downloaded) : -1
        });

        ct.ThrowIfCancellationRequested();
    }
}
|
||||
|
||||
protected virtual async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
|
||||
/// <summary>
/// Downloads <paramref name="url"/> into the file at <paramref name="path"/>, creating the file
/// or overwriting it from the start if it already exists.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="bufferSize">Copy buffer size, forwarded to <c>DownloadToStream</c>.</param>
/// <param name="path">Destination file path. Its directory must already exist; a bare file name targets the current directory.</param>
/// <param name="progress">Progress sink, forwarded to <c>DownloadToStream</c>.</param>
/// <param name="ct">Cancellation token, forwarded to <c>DownloadToStream</c>.</param>
/// <exception cref="InvalidOperationException">The directory part of <paramref name="path"/> does not exist.</exception>
protected virtual async Task DownloadToFile(string url, int bufferSize, string path,
    IProgress<IDownloadReport> progress, CancellationToken ct) {

    // GetDirectoryName returns an empty string for a bare file name; that means "current
    // directory", which Directory.Exists("") would wrongly report as missing.
    var directory = Path.GetDirectoryName(path);
    if (!string.IsNullOrEmpty(directory) && !Directory.Exists(directory))
        throw new InvalidOperationException(
            string.Format(Exceptions.Exceptions.unit_download_directory_nonexistant, path));

    // File.Create truncates an existing file. File.OpenWrite does not, so a shorter download
    // written over a longer existing file would keep the old trailing bytes.
    await using var file = File.Create(path);
    await DownloadToStream(url, bufferSize, file, progress, ct);
}
|
||||
|
||||
/// <summary>
/// Downloads <paramref name="url"/> into an in-memory buffer and wraps the result in a
/// <see cref="ByteDocument"/>.
/// </summary>
/// <param name="url">Address to request; also passed to the returned document.</param>
/// <param name="bufferSize">Copy buffer size, forwarded to <c>DownloadToStream</c>.</param>
/// <param name="progress">Progress sink, forwarded to <c>DownloadToStream</c>.</param>
/// <param name="ct">Cancellation token, forwarded to <c>DownloadToStream</c>.</param>
/// <returns>A <see cref="ByteDocument"/> built from <paramref name="url"/> and the downloaded bytes.</returns>
/// <exception cref="Exception">The memory stream's underlying buffer could not be obtained.</exception>
protected virtual async Task<ByteDocument> DownloadToMemory(string url, int bufferSize,
    IProgress<IDownloadReport> progress, CancellationToken ct) {

    await using var memory = new MemoryStream();
    await DownloadToStream(url, bufferSize, memory, progress, ct);

    // Hand the stream's backing buffer to the document instead of copying it.
    if (memory.TryGetBuffer(out var segment))
        return new ByteDocument(url, segment);

    throw new Exception(Exceptions.Exceptions.unit_download_invalid_memory_stream);
}
|
||||
|
||||
/// <summary>
/// Runs the configured failure predicates against <paramref name="doc"/> and reports whether
/// any of them flagged it as a failed download.
/// </summary>
/// <param name="doc">The raw download to inspect.</param>
/// <param name="ct">Cancellation token; only used by the parallel evaluation path.</param>
/// <returns>
/// <c>true</c> when at least one non-null predicate returned <c>true</c>; <c>false</c> when no
/// predicates are configured or none matched.
/// </returns>
protected virtual async Task<bool> IsFailure(RawType doc, CancellationToken ct) {
    if (FailurePredicates is null)
        return false;

    bool runInParallel = Options?.FailurePredicateOptions?.ProcessInParallel ?? false;
    if (runInParallel) {
        var anyFailed = false;
        var parallelOptions = new ParallelOptions() {
            // Falls back to 4 workers when no thread count is configured.
            MaxDegreeOfParallelism = Options?.FailurePredicateOptions?.ParallelThreads ?? 4,
            CancellationToken = ct
        };

        await Parallel.ForEachAsync(FailurePredicates, parallelOptions, async (candidate, token) => {
            // Skip work once cancelled, once another predicate has already matched,
            // or when the array slot is empty.
            if (token.IsCancellationRequested || anyFailed || candidate == null)
                return;

            if (await candidate(doc))
                Interlocked.CompareExchange(ref anyFailed, true, false);
        });

        return anyFailed;
    }

    // Sequential path: stop at the first predicate that matches.
    foreach (var check in FailurePredicates) {
        if (check is not null && await check(doc))
            return true;
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Fetches <paramref name="link"/> and produces the raw document type this downloader works with.
/// A <see cref="StringDocument"/> downloader with a configured download folder saves to a randomly
/// named file in that folder; a <see cref="ByteDocument"/> downloader downloads into memory.
/// </summary>
/// <param name="link">Address to download.</param>
/// <param name="progress">Progress sink, forwarded to the underlying download helper.</param>
/// <param name="ct">Cancellation token, forwarded to the underlying download helper.</param>
/// <returns>The downloaded document, cast to <typeparamref name="RawType"/>.</returns>
/// <exception cref="NotSupportedException">Neither of the two supported configurations applies.</exception>
protected virtual async Task<RawType> _Download(string link, IProgress<IDownloadReport> progress, CancellationToken ct) {
    object? result = null;

    if (Options.DownloadFolder is { } folder && this is UnitDownloader<StringDocument, OutType>) {
        // The document is constructed from the link and the path of the file just written.
        var target = Path.Combine(folder, Path.GetRandomFileName());
        await DownloadToFile(link, Options.BufferSize, target, progress, ct);
        result = new StringDocument(link, target);
    } else if (this is UnitDownloader<ByteDocument, OutType>) {
        result = await DownloadToMemory(link, Options.BufferSize, progress, ct);
    }

    if (result is null)
        throw new NotSupportedException(Exceptions.Exceptions.unit_downloader_limited_support);

    // The type tests above guarantee the boxed value is a RawType.
    return (RawType)result;
}
|
||||
|
||||
protected virtual async Task<(bool, OutType?)> Transform(RawType download, CancellationToken ct) {
|
||||
try {
|
||||
var html = await Web.LoadFromWebAsync(link, ct);
|
||||
if (FailurePredicates is null || !(await IsFailure(html)))
|
||||
return (true, await Transformer(html));
|
||||
if (FailurePredicates is null || !(await IsFailure(download, ct)))
|
||||
return (true, await Transformer(download));
|
||||
else
|
||||
return (false, default);
|
||||
} catch(Exception) {
|
||||
@@ -45,23 +116,26 @@ namespace Beam.Downloaders {
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Downloads and transforms the first entry of <paramref name="link"/>, retrying with an
/// exponential back-off until it succeeds or <paramref name="maximumRetryCount"/> attempts have failed.
/// </summary>
/// <param name="link">Links to download; only the first element is used. An empty array yields a failure result.</param>
/// <param name="ct">Token checked before every attempt and observed while waiting between attempts.</param>
/// <param name="maximumRetryCount">Maximum number of attempts.</param>
/// <param name="downProgress">Optional sink for download progress; a default <see cref="Progress{T}"/> is used when null.</param>
/// <param name="tryProgress">Optional sink that receives a <see cref="RetryReport"/> after every failed attempt.</param>
/// <returns>
/// <c>(true, value)</c> on the first attempt whose transform succeeds with a non-null value;
/// otherwise <c>(false, lastValue)</c> where <c>lastValue</c> is whatever the last transform returned.
/// </returns>
/// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
public async Task<(bool, OutType?)> TryDownload(IOrdered<string>[] link, CancellationToken ct, int maximumRetryCount = 7, IProgress<IDownloadReport>? downProgress = null, IProgress<IRetryReport>? tryProgress = null) {
    if (link.Length == 0)
        return (false, default);

    downProgress ??= new Progress<IDownloadReport>();

    OutType? ot = default;
    int tryCount = 0;
    while (tryCount < maximumRetryCount) {
        ct.ThrowIfCancellationRequested();

        var rt = await _Download(link[0].Data, downProgress, ct);
        (var success, ot) = await Transform(rt, ct);
        if (success && ot != null)
            return (true, ot);

        ++tryCount;
        tryProgress?.Report(new RetryReport(tryCount, link[0].Data));

        // No further attempt follows, so waiting out another back-off interval would only
        // delay the failure result.
        if (tryCount >= maximumRetryCount)
            break;

        // Back-off of 2^tryCount seconds, clamped so large retry counts cannot overflow int.
        var delayMs = (int)Math.Min(Math.Pow(2, tryCount) * 1000, int.MaxValue);
        await Task.Delay(delayMs, ct);
    }

    return (false, ot);
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user