From 2958a26e4f720366b879af40628bac04e60a5dc0 Mon Sep 17 00:00:00 2001 From: qwsdcvghyu89 <61093706+qwsdcvghyu89@users.noreply.github.com> Date: Mon, 29 Sep 2025 21:27:56 +1000 Subject: [PATCH] Refactor downloaders to use generic options and unify logic Replaces specialized binary and HTML downloaders with a generic, options-driven UnitDownloader and UnitFragmentDownloader pattern. Introduces UnitDownloaderOptions and builder classes for flexible configuration, updates interfaces and method signatures to support progress reporting, and removes redundant binary-specific classes. Updates Playwright and Stealth downloaders to use the new generic base, and adds improved error handling and reporting. Also updates dependency versions and makes minor API consistency improvements across the Fluent and Models layers. --- Beam.Abstractions/Beam.Abstractions.csproj | 2 +- Beam.Abstractions/IDownloadReport.cs | 5 +- Beam.Abstractions/IUnitDownloader.cs | 2 +- Beam.Downloaders/Beam.Downloaders.csproj | 1 + Beam.Downloaders/SequentialDownloader.cs | 1 + Beam.Downloaders/UnitDownloader.cs | 128 ++++++++--- Beam.Downloaders/UnitDownloaderBinary.cs | 74 ------- Beam.Downloaders/UnitDownloaderOptions.cs | 198 ++++++++++++++++++ Beam.Downloaders/UnitFragmentDownloader.cs | 42 ++-- .../UnitFragmentDownloaderBinary.cs | 74 ------- Beam.Dynamic/Beam.Dynamic.csproj | 2 +- Beam.Exceptions/Exceptions.Designer.cs | 27 +++ Beam.Exceptions/Exceptions.resx | 9 + Beam.Fluent/Beam.Fluent.csproj | 3 +- Beam.Fluent/ContextStage.cs | 119 ++++------- Beam.Fluent/Core/FragmentMode.cs | 2 +- Beam.Fluent/DownloadStage.cs | 3 +- Beam.Fluent/FluentDownload.cs | 7 +- Beam.Fluent/TransformStage.cs | 5 +- Beam.Models/ByteDocument.cs | 17 +- Beam.Models/DownloadReport.cs | 4 +- Beam.Playwright/PlaywrightUnitDownloader.cs | 46 ++-- .../PlaywrightUnitPageDownloader.cs | 39 ---- Beam.Stealth/StealthFragmentDownloader.cs | 5 +- Beam.Stealth/StealthFragmentPageDownloader.cs | 5 +- Beam.Stealth/StealthUnitDownloader.cs | 46 ++-- Beam.Stealth/StealthUnitPageDownloader.cs | 28 +-- Beam/Beam.csproj | 4 +- aeqw89.Beam/aeqw89.Beam.csproj | 15 +- aeqw89.Beam/aeqw89.Beam.csproj.bak | 130 ++++++++++++ 30 files changed, 621 insertions(+), 422 deletions(-) delete mode 100644 Beam.Downloaders/UnitDownloaderBinary.cs create mode 100644 Beam.Downloaders/UnitDownloaderOptions.cs delete mode 100644 Beam.Downloaders/UnitFragmentDownloaderBinary.cs delete mode 100644 Beam.Playwright/PlaywrightUnitPageDownloader.cs create mode 100644 aeqw89.Beam/aeqw89.Beam.csproj.bak diff --git a/Beam.Abstractions/Beam.Abstractions.csproj b/Beam.Abstractions/Beam.Abstractions.csproj index cfe70e6..994b4c0 100644 --- a/Beam.Abstractions/Beam.Abstractions.csproj +++ b/Beam.Abstractions/Beam.Abstractions.csproj @@ -8,7 +8,7 @@ - + diff --git a/Beam.Abstractions/IDownloadReport.cs b/Beam.Abstractions/IDownloadReport.cs index 615b92f..3782238 100644 --- a/Beam.Abstractions/IDownloadReport.cs +++ b/Beam.Abstractions/IDownloadReport.cs @@ -1,3 +1,6 @@ namespace Beam.Abstractions; -public interface IDownloadReport { } \ No newline at end of file +public interface IDownloadReport { + long BytesDownloaded { get; init; } + long? BytesRemaining { get; init; } +} \ No newline at end of file diff --git a/Beam.Abstractions/IUnitDownloader.cs b/Beam.Abstractions/IUnitDownloader.cs index 736197b..c3e1daf 100644 --- a/Beam.Abstractions/IUnitDownloader.cs +++ b/Beam.Abstractions/IUnitDownloader.cs @@ -3,5 +3,5 @@ namespace Beam.Abstractions; public interface IUnitDownloader { public int LinksPerDownload { get; } - public Task<(bool, T?)> TryDownload(IOrdered[] link, CancellationToken ct, int maximumRetryCount = 7, IProgress? tryProgress = null); + public Task<(bool, T?)> TryDownload(IOrdered[] link, CancellationToken ct, int maximumRetryCount = 7, IProgress? downProgress = null, IProgress? tryProgress = null); } \ No newline at end of file diff --git a/Beam.Downloaders/Beam.Downloaders.csproj b/Beam.Downloaders/Beam.Downloaders.csproj index 3e5ee31..ab69f30 100644 --- a/Beam.Downloaders/Beam.Downloaders.csproj +++ b/Beam.Downloaders/Beam.Downloaders.csproj @@ -14,6 +14,7 @@ + diff --git a/Beam.Downloaders/SequentialDownloader.cs b/Beam.Downloaders/SequentialDownloader.cs index c9f586e..bc35f16 100644 --- a/Beam.Downloaders/SequentialDownloader.cs +++ b/Beam.Downloaders/SequentialDownloader.cs @@ -58,6 +58,7 @@ namespace Beam.Downloaders { var (result, downloadedT) = await unit.TryDownload( links.ToArray(), Context.CancellationToken, + downProgress: Context.DownloadReporter, tryProgress: Context.RetryReporter); if (!result) { diff --git a/Beam.Downloaders/UnitDownloader.cs b/Beam.Downloaders/UnitDownloader.cs index a2c192d..2d1c774 100644 --- a/Beam.Downloaders/UnitDownloader.cs +++ b/Beam.Downloaders/UnitDownloader.cs @@ -1,6 +1,7 @@ using Beam.Abstractions; using Beam.Models; using HtmlAgilityPack; +using File = System.IO.File; namespace Beam.Downloaders { /// @@ -10,34 +11,104 @@ namespace Beam.Downloaders { /// /// /// - public class UnitDownloader(HtmlWeb web, AsyncTransformer transformer, AsyncDownloadFailurePredicate?[]? failurePredicate = null) : IUnitDownloader { - public HtmlWeb Web { get; } = web; - public virtual AsyncTransformer Transformer { get; } = transformer; - public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates { get; } = failurePredicate; + public class UnitDownloader(UnitDownloaderOptions options) : IUnitDownloader where RawType : IDocument { + public UnitDownloaderOptions Options { get; } = options; + public HttpClient Client => Options.Client; + public virtual AsyncTransformer Transformer => Options.AsyncTransformer; + + public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates => + Options?.FailurePredicateOptions?.AsyncDownloadFailurePredicates; public int LinksPerDownload { get; } = 1; - protected virtual async Task IsFailure(HtmlDocument doc) { - if (FailurePredicates is null) - return false; - var failed = false; - await Parallel.ForEachAsync(FailurePredicates, async (x, ct) => { - if (failed == true) - return; - if (x is null) - return; - if (await x(doc)) - failed = true; - }); + protected virtual async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, + CancellationToken ct) { - return failed; + var stream = await Client.GetStreamAsync(url, ct); + byte[] buffer = new byte[bufferSize]; + int inBuffer = 0; + long downloaded = 0; + while ((inBuffer = stream.Read(buffer)) > 0) { + downloaded += inBuffer; + await destinationStream.WriteAsync(buffer.AsMemory(0, inBuffer), ct); + progress?.Report(new DownloadReport() { + BytesDownloaded = inBuffer, + BytesRemaining = stream.Length - downloaded + }); + + ct.ThrowIfCancellationRequested(); + } } - protected virtual async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) { + protected virtual async Task DownloadToFile(string url, int bufferSize, string path, + IProgress progress, CancellationToken ct) { + + if (!Directory.Exists(Path.GetDirectoryName(path))) + throw new InvalidOperationException( + string.Format(Exceptions.Exceptions.unit_download_directory_nonexistant, path)); + await using var file = File.OpenWrite(path); + await DownloadToStream(url, bufferSize, file, progress, ct); + } + + protected virtual async Task DownloadToMemory(string url, int bufferSize, + IProgress progress, CancellationToken ct) { + + await using var ms = new MemoryStream(); + await DownloadToStream(url, bufferSize, ms, progress, ct); + if (!ms.TryGetBuffer(out var bytes)) + throw new Exception(Exceptions.Exceptions.unit_download_invalid_memory_stream); + return new ByteDocument(url, bytes); + } + + protected virtual async Task IsFailure(RawType doc, CancellationToken ct) { + if (FailurePredicates is null) + return false; + if (!(Options?.FailurePredicateOptions?.ProcessInParallel ?? false)) + foreach (var pred in FailurePredicates) { + if (pred is null) + continue; + if (await pred(doc)) + return true; + } + else { + var failed = false; + await Parallel.ForEachAsync(FailurePredicates, new ParallelOptions() { + MaxDegreeOfParallelism = Options?.FailurePredicateOptions?.ParallelThreads ?? 4, + CancellationToken = ct + }, + async (predicate, token) => { + if (token.IsCancellationRequested) + return; + if (failed) + return; + if (predicate == null) + return; + if (await predicate(doc)) + Interlocked.CompareExchange(ref failed, true, false); + } + ); + return failed; + } + + return false; + } + + protected virtual async Task _Download(string link, IProgress progress, CancellationToken ct) { + if (Options.DownloadFolder is not null && this is UnitDownloader) { + var path = Path.Combine(Options.DownloadFolder, Path.GetRandomFileName()); + await DownloadToFile(link, Options.BufferSize, path, progress, ct); + return (RawType)(object)new StringDocument(link, path); + } + if (this is UnitDownloader) { + return (RawType)(object)(await DownloadToMemory(link, Options.BufferSize, progress, ct)); + } + throw new NotSupportedException(Exceptions.Exceptions.unit_downloader_limited_support); + } + + protected virtual async Task<(bool, OutType?)> Transform(RawType download, CancellationToken ct) { try { - var html = await Web.LoadFromWebAsync(link, ct); - if (FailurePredicates is null || !(await IsFailure(html))) - return (true, await Transformer(html)); + if (FailurePredicates is null || !(await IsFailure(download, ct))) + return (true, await Transformer(download)); else return (false, default); } catch(Exception) { @@ -45,23 +116,26 @@ namespace Beam.Downloaders { } } - public async Task<(bool, T?)> TryDownload(IOrdered[] link, CancellationToken ct, int maximumRetryCount = 7, IProgress? tryProgress = null) { + public async Task<(bool, OutType?)> TryDownload(IOrdered[] link, CancellationToken ct, int maximumRetryCount = 7, IProgress? downProgress = null, IProgress? tryProgress = null) { if (link.Length == 0) return (false, default); - T? doc = default; + downProgress ??= new Progress(); + + OutType? ot = default; int tryCount = 0; while (tryCount < maximumRetryCount) { ct.ThrowIfCancellationRequested(); - (var success, doc) = await TryDownloadWithNoRetries(link[0].Data, ct); - if (success && doc != null) - return (true, doc); + var rt = await _Download(link[0].Data, downProgress, ct); + (var success, ot) = await Transform(rt, ct); + if (success && ot != null) + return (true, ot); ++tryCount; tryProgress?.Report(new RetryReport(tryCount, link[0].Data)); await Task.Delay((int)Math.Pow(2, tryCount) * 1000); } - return (false, doc); + return (false, ot); } } } diff --git a/Beam.Downloaders/UnitDownloaderBinary.cs b/Beam.Downloaders/UnitDownloaderBinary.cs deleted file mode 100644 index f32395e..0000000 --- a/Beam.Downloaders/UnitDownloaderBinary.cs +++ /dev/null @@ -1,74 +0,0 @@ -using Beam.Abstractions; -using Beam.Models; - -namespace Beam.Downloaders { - /// - /// A download-managing class that retrieves binary data through , - /// applies an , and supports failure detection - /// plus exponential-back-off retries. Safe to instantiate per request. - /// - public class UnitDownloaderBinary( - HttpClient client, - AsyncTransformer transformer, - AsyncDownloadFailurePredicate?[]? failurePredicates = null) - : IUnitDownloader { - public HttpClient Client { get; } = client; - public virtual AsyncTransformer Transformer { get; } = transformer; - public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates { get; } = failurePredicates; - - public int LinksPerDownload { get; } = 1; - - /// Runs all configured failure predicates in parallel on the raw HTTP response. - protected virtual async Task IsFailure(ByteDocument response) { - if (FailurePredicates is null) return false; - - var failed = false; - await Parallel.ForEachAsync(FailurePredicates, async (pred, ct) => { - if (failed || pred is null) return; - if (await pred(response)) - failed = true; - }); - return failed; - } - - /// One attempt without retries or back-off. - protected virtual async Task<(bool Success, T? Result)> TryDownloadWithNoRetries(string link, CancellationToken ct) { - try { - using var response = await Client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, ct); - if (!response.IsSuccessStatusCode) return (false, default); - - var bytes = await response.Content.ReadAsByteArrayAsync(ct); - var doc = new ByteDocument(link, bytes); - if (await IsFailure(doc)) return (false, default); - - return (true, await Transformer(doc)); - } catch { - return (false, default); - } - } - - public async Task<(bool, T?)> TryDownload( - IOrdered[] link, - CancellationToken ct, - int maximumRetryCount = 7, - IProgress? tryProgress = null) { - if (link.Length == 0) return (false, default); - - T? result = default; - var attempt = 0; - - while (attempt < maximumRetryCount) { - ct.ThrowIfCancellationRequested(); - - (var success, result) = await TryDownloadWithNoRetries(link[0].Data, ct); - if (success && result is not null) return (true, result); - - ++attempt; - tryProgress?.Report(new RetryReport(attempt, link[0].Data)); - await Task.Delay((int)Math.Pow(2, attempt) * 1000, ct); - } - - return (false, result); - } - } -} diff --git a/Beam.Downloaders/UnitDownloaderOptions.cs b/Beam.Downloaders/UnitDownloaderOptions.cs new file mode 100644 index 0000000..dc99555 --- /dev/null +++ b/Beam.Downloaders/UnitDownloaderOptions.cs @@ -0,0 +1,198 @@ +using Beam.Models; + +namespace Beam.Downloaders; + +public record class UnitDownloaderOptions { + public HttpClient Client { get; init; } = new(); + + public FailurePredicateOptions? FailurePredicateOptions { get; init; } + public FragmentOptions? FragmentOptions { get; init; } + public required AsyncTransformer AsyncTransformer { get; init; } + public string? DownloadFolder { get; init; } = null; + public int BufferSize { get; init; } = 80 * 1024; // 80kb +} + +public record class FailurePredicateOptions { + public required AsyncDownloadFailurePredicate?[]? AsyncDownloadFailurePredicates { get; init; } + public bool ProcessInParallel { get; init; } = false; + public int? ParallelThreads { get; init; } +} + +public record class FragmentOptions { + public required int FragmentSize { get; init; } + public bool DownloadInParallel { get; init; } = false; + public int? ParallelThreads { get; init; } +} + + + // ---------- UnitDownloaderOptions Builder ---------- + public sealed class UnitDownloaderOptionsBuilder + { + private HttpClient _client = new HttpClient(); + private FailurePredicateOptions? _failureOptions; + private FragmentOptions? _fragmentOptions; + private AsyncTransformer? _asyncTransformer; + private string? _downloadFolder = null; + private int _bufferSize = 80 * 1024; + + public UnitDownloaderOptionsBuilder WithClient(HttpClient client) + { + _client = client ?? throw new System.ArgumentNullException(nameof(client)); + return this; + } + + public UnitDownloaderOptionsBuilder WithFailurePredicateOptions(FailurePredicateOptions? options) + { + _failureOptions = options; + return this; + } + + public UnitDownloaderOptionsBuilder WithFailurePredicates(System.Action> configure) + { + if (configure == null) throw new System.ArgumentNullException(nameof(configure)); + var b = new FailurePredicateOptionsBuilder(); + configure(b); + _failureOptions = b.Build(); + return this; + } + + public UnitDownloaderOptionsBuilder WithFragmentOptions(FragmentOptions? options) + { + _fragmentOptions = options; + return this; + } + + public UnitDownloaderOptionsBuilder WithFragments(System.Action configure) + { + if (configure == null) throw new System.ArgumentNullException(nameof(configure)); + var b = new FragmentOptionsBuilder(); + configure(b); + _fragmentOptions = b.Build(); + return this; + } + + public UnitDownloaderOptionsBuilder WithAsyncTransformer(AsyncTransformer transformer) + { + _asyncTransformer = transformer ?? throw new System.ArgumentNullException(nameof(transformer)); + return this; + } + + public UnitDownloaderOptionsBuilder WithDownloadFolder(string? downloadFolder) + { + _downloadFolder = downloadFolder; + return this; + } + + public UnitDownloaderOptionsBuilder WithBufferSize(int bytes) + { + if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); + _bufferSize = bytes; + return this; + } + + public UnitDownloaderOptions Build() + { + if (_asyncTransformer == null) + throw new System.InvalidOperationException("AsyncTransformer must be provided."); + + return new UnitDownloaderOptions + { + Client = _client, + FailurePredicateOptions = _failureOptions, + FragmentOptions = _fragmentOptions, + AsyncTransformer = _asyncTransformer, + DownloadFolder = _downloadFolder, + BufferSize = _bufferSize + }; + } + } + + // ---------- FailurePredicateOptions Builder ---------- + public sealed class FailurePredicateOptionsBuilder + { + private readonly System.Collections.Generic.List?> _predicates = + new System.Collections.Generic.List?>(); + private bool _processInParallel = false; + private int? _parallelThreads = null; + + public FailurePredicateOptionsBuilder WithPredicate(AsyncDownloadFailurePredicate? predicate) + { + _predicates.Add(predicate); + return this; + } + + public FailurePredicateOptionsBuilder WithPredicates(System.Collections.Generic.IEnumerable?> predicates) + { + if (predicates == null) throw new System.ArgumentNullException(nameof(predicates)); + _predicates.AddRange(predicates); + return this; + } + + public FailurePredicateOptionsBuilder WithPredicates(params AsyncDownloadFailurePredicate?[] predicates) + { + _predicates.Clear(); + if (predicates != null) _predicates.AddRange(predicates); + return this; + } + + public FailurePredicateOptionsBuilder WithProcessInParallel(bool value = true) + { + _processInParallel = value; + return this; + } + + public FailurePredicateOptionsBuilder WithParallelThreads(int? threads) + { + if (threads.HasValue && threads.Value <= 0) + throw new System.ArgumentOutOfRangeException(nameof(threads)); + _parallelThreads = threads; + return this; + } + + public FailurePredicateOptions Build() + { + var arr = _predicates.Count == 0 ? [] : _predicates.ToArray(); + return new FailurePredicateOptions + { + AsyncDownloadFailurePredicates = arr, + ProcessInParallel = _processInParallel, + ParallelThreads = _parallelThreads + }; + } + } + + // ---------- FragmentOptions Builder ---------- + public sealed class FragmentOptionsBuilder { + private int? _fragmentSize; + private bool _downloadInParallel = false; + private int? _parallelThreads = null; + + public FragmentOptionsBuilder WithFragmentSize(int bytes) { + if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); + _fragmentSize = bytes; + return this; + } + + public FragmentOptionsBuilder WithDownloadInParallel(bool value = true) { + _downloadInParallel = value; + return this; + } + + public FragmentOptionsBuilder WithParallelThreads(int? threads) { + if (threads.HasValue && threads.Value <= 0) + throw new System.ArgumentOutOfRangeException(nameof(threads)); + _parallelThreads = threads; + return this; + } + + public FragmentOptions Build() { + if (!_fragmentSize.HasValue) + throw new System.InvalidOperationException("FragmentSize must be provided."); + + return new FragmentOptions { + FragmentSize = _fragmentSize.Value, + DownloadInParallel = _downloadInParallel, + ParallelThreads = _parallelThreads + }; + } + } diff --git a/Beam.Downloaders/UnitFragmentDownloader.cs b/Beam.Downloaders/UnitFragmentDownloader.cs index c834ab3..a399128 100644 --- a/Beam.Downloaders/UnitFragmentDownloader.cs +++ b/Beam.Downloaders/UnitFragmentDownloader.cs @@ -5,55 +5,39 @@ using HtmlAgilityPack; using Microsoft.Extensions.Logging; namespace Beam.Downloaders { - public class UnitFragmentDownloader : IUnitDownloader>> { - public UnitFragmentDownloader(HtmlWeb web, - AsyncTransformer transformer, - AsyncDownloadFailurePredicate?[]? failurePredicate = null, - int fragmentSize = 4, - ILogger? logger = null, - IUnitDownloader? internalDownloader = null) { - Web = web; - Transformer = transformer; - FailurePredicate = failurePredicate; - UnitDownloader = internalDownloader ?? new UnitDownloader(Web, Transformer, FailurePredicate); - LinksPerDownload = fragmentSize; - Logger = logger; - } + public class UnitFragmentDownloader(UnitDownloaderOptions options, + IUnitDownloader? internalDownloader = null) : IUnitDownloader>> where RawType : IDocument { - public HtmlWeb Web { get; } - public AsyncTransformer Transformer { get; } - public AsyncDownloadFailurePredicate?[]? FailurePredicate { get; } + public UnitDownloaderOptions Options { get; } = options; public int LinksPerDownload { get; set; } - public ILogger? Logger { get; set; } + private IUnitDownloader UnitDownloader { get; } = internalDownloader ?? new UnitDownloader(options); - private readonly IUnitDownloader UnitDownloader; - - async Task<(bool, Fragment>?)> IUnitDownloader>>.TryDownload(IOrdered[] link, CancellationToken ct, int maximumRetryCount, IProgress? tryProgress) { - Fragment> fragment = new Fragment>(link.Length); - if (!Fragment>.TryAcquireUpdater(fragment, out var updater)) + async Task<(bool, Fragment>?)> IUnitDownloader>>.TryDownload(IOrdered[] link, CancellationToken ct, int maximumRetryCount, IProgress? downProgress, IProgress? tryProgress) { + Fragment> fragment = new Fragment>(link.Length); + if (!Fragment>.TryAcquireUpdater(fragment, out var updater)) throw new AssertionException(Exceptions.Exceptions.fragment_locked); bool isFailure = false; await Parallel.ForEachAsync(link, async (x, pct) => { pct.ThrowIfCancellationRequested(); ct.ThrowIfCancellationRequested(); - var (result, downloadedT) = await UnitDownloader.TryDownload([x], ct, maximumRetryCount, tryProgress); + if (isFailure) + return; + var (result, downloadedT) = await UnitDownloader.TryDownload([x], ct, maximumRetryCount, downProgress, tryProgress); if (!result) { Interlocked.Exchange(ref isFailure, true); - Logger?.LogError("Failed to retrieve {0} order={1}", x.Data, x.Order); return; } if (downloadedT == null) { Interlocked.Exchange(ref isFailure, true); - Logger?.LogCritical("Failed to retrieve {0} order={1}", x.Data, x.Order); return; } - updater(new Ordered(downloadedT, x.Order)); + updater(new Ordered(downloadedT, x.Order)); }); if (!isFailure) - Fragment>.SetComplete(fragment, true); + Fragment>.SetComplete(fragment, true); - Fragment>.TryReleaseUpdater(fragment, updater); + Fragment>.TryReleaseUpdater(fragment, updater); return (!isFailure, fragment); diff --git a/Beam.Downloaders/UnitFragmentDownloaderBinary.cs b/Beam.Downloaders/UnitFragmentDownloaderBinary.cs deleted file mode 100644 index f5c687d..0000000 --- a/Beam.Downloaders/UnitFragmentDownloaderBinary.cs +++ /dev/null @@ -1,74 +0,0 @@ -using Beam.Abstractions; -using Beam.Exceptions; -using Beam.Models; -using Microsoft.Extensions.Logging; - -namespace Beam.Downloaders { - /// - /// Groups multiple binary downloads into a single Fragment, applying - /// failure detection and exponential-back-off retries for each link. - /// - public class UnitFragmentDownloaderBinary - : IUnitDownloader>> { - public UnitFragmentDownloaderBinary(HttpClient client, - AsyncTransformer transformer, - AsyncDownloadFailurePredicate?[]? failurePredicate = null, - int fragmentSize = 4, - ILogger? logger = null, - IUnitDownloader? internalDownloader = null) { - Client = client; - Transformer = transformer; - FailurePredicate = failurePredicate; - UnitDownloader = internalDownloader - ?? new UnitDownloaderBinary(Client, Transformer, FailurePredicate); - LinksPerDownload = fragmentSize; - Logger = logger; - } - - public HttpClient Client { get; } - public AsyncTransformer Transformer { get; } - public AsyncDownloadFailurePredicate?[]? FailurePredicate { get; } - public int LinksPerDownload { get; set; } - public ILogger? Logger { get; set; } - - private readonly IUnitDownloader UnitDownloader; - - async Task<(bool, Fragment>?)> IUnitDownloader>>.TryDownload( - IOrdered[] link, - CancellationToken ct, - int maximumRetryCount, - IProgress? tryProgress) { - var fragment = new Fragment>(link.Length); - if (!Fragment>.TryAcquireUpdater(fragment, out var updater)) - throw new AssertionException(Exceptions.Exceptions.fragment_locked); - - var isFailure = false; - - await Parallel.ForEachAsync(link, async (orderedLink, pct) => { - pct.ThrowIfCancellationRequested(); - ct.ThrowIfCancellationRequested(); - - var (success, downloaded) = - await UnitDownloader.TryDownload([orderedLink], - ct, - maximumRetryCount, - tryProgress); - - if (!success || downloaded is null) { - Interlocked.Exchange(ref isFailure, true); - Logger?.LogError("Failed to retrieve {Link} order={Order}", - orderedLink.Data, orderedLink.Order); - return; - } - - updater(new Ordered(downloaded, orderedLink.Order)); - }); - - if (!isFailure) - Fragment>.SetComplete(fragment, true); - - Fragment>.TryReleaseUpdater(fragment, updater); - return (!isFailure, fragment); - } - } -} diff --git a/Beam.Dynamic/Beam.Dynamic.csproj b/Beam.Dynamic/Beam.Dynamic.csproj index 3d6d4c8..4f3aaeb 100644 --- a/Beam.Dynamic/Beam.Dynamic.csproj +++ b/Beam.Dynamic/Beam.Dynamic.csproj @@ -6,7 +6,7 @@ enable - + diff --git a/Beam.Exceptions/Exceptions.Designer.cs b/Beam.Exceptions/Exceptions.Designer.cs index 5b5cfc9..27d1901 100644 --- a/Beam.Exceptions/Exceptions.Designer.cs +++ b/Beam.Exceptions/Exceptions.Designer.cs @@ -157,5 +157,32 @@ namespace Beam.Exceptions { return ResourceManager.GetString("state_change_error", resourceCulture); } } + + /// + /// Looks up a localized string similar to Could not open a filestream to a non-existant directory '{0}'.. + /// + public static string unit_download_directory_nonexistant { + get { + return ResourceManager.GetString("unit_download_directory_nonexistant", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to The memory stream was created with an invisible inner byte array.. + /// + public static string unit_download_invalid_memory_stream { + get { + return ResourceManager.GetString("unit_download_invalid_memory_stream", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to The base unit downloader class only supports RawType's of string and ByteDocument.. + /// + public static string unit_downloader_limited_support { + get { + return ResourceManager.GetString("unit_downloader_limited_support", resourceCulture); + } + } } } diff --git a/Beam.Exceptions/Exceptions.resx b/Beam.Exceptions/Exceptions.resx index 0adb3c5..c80b7ad 100644 --- a/Beam.Exceptions/Exceptions.resx +++ b/Beam.Exceptions/Exceptions.resx @@ -51,4 +51,13 @@ There must be at least one state in resource definition. + + Could not open a filestream to a non-existant directory '{0}'. + + + The memory stream was created with an invisible inner byte array. + + + The base unit downloader class only supports RawType's of string and ByteDocument. + \ No newline at end of file diff --git a/Beam.Fluent/Beam.Fluent.csproj b/Beam.Fluent/Beam.Fluent.csproj index 2d6a5ab..5ccd49d 100644 --- a/Beam.Fluent/Beam.Fluent.csproj +++ b/Beam.Fluent/Beam.Fluent.csproj @@ -6,12 +6,11 @@ enable - + - diff --git a/Beam.Fluent/ContextStage.cs b/Beam.Fluent/ContextStage.cs index be079d9..c387c04 100644 --- a/Beam.Fluent/ContextStage.cs +++ b/Beam.Fluent/ContextStage.cs @@ -8,13 +8,14 @@ using Beam.Downloaders; namespace Beam.Fluent; -internal sealed class ContextStage : IContextStage { +internal sealed class ContextStage : IContextStage where RawType : IDocument { private readonly DownloadContextBuilder _ctxBuilder; private readonly AsyncTransformer _transformer; private FragmentMode _fragmentMode = FragmentMode.Single; private Channel _channel = Channel.Plain; private readonly ContentKind _contentKind; private int _parallelism = 4; + private UnitDownloaderOptionsBuilder _optionsBuilder = new(); // ──────────────── playwright ──────────────── private PlaywrightAsyncManipulator? _playwrightManipulator = null; @@ -31,12 +32,15 @@ internal sealed class ContextStage : IContextStage => ContentKind.Html, + AsyncTransformer => ContentKind.File, AsyncTransformer => ContentKind.Binary, _ => throw new ArgumentException(string.Format(Exceptions.Exceptions.fluent_unsupported_transformer, transformer.GetType() .AsUniqueName())) }; + + _optionsBuilder + .WithAsyncTransformer(_transformer); } public IContextStage Configure(Action> configure) { @@ -44,6 +48,12 @@ internal sealed class ContextStage : IContextStage ConfigureUnitDownloaderOptions( + Action> configure) { + configure(_optionsBuilder); + return this; + } + public IContextStage WithParallelism(int degree) { _parallelism = Math.Max(1, degree); return this; @@ -108,14 +118,14 @@ internal sealed class ContextStage : IContextStage HtmlTransformer() - => To>(_transformer); + AsyncTransformer FileTransformer() + => To>(_transformer); AsyncTransformer ByteTransformer() => To>(_transformer); - AsyncDownloadFailurePredicate[] HtmlFailurePredicates() - => To[]>(context.AsyncFailurePredicates); + AsyncDownloadFailurePredicate[] FileFailurePredicates() + => To[]>(context.AsyncFailurePredicates); AsyncDownloadFailurePredicate[] ByteFailurePredicates() => To[]>(context.AsyncFailurePredicates); @@ -125,82 +135,39 @@ internal sealed class ContextStage : IContextStage x.WithPredicates(context.AsyncFailurePredicates)); + var options = _optionsBuilder + .WithClient(context.Client) + .Build(); + return (_channel, _fragmentMode, _contentKind) switch { - // ──────────────── fragmented HTML ──────────────── - (Channel.Plain, FragmentMode.Fragmented, ContentKind.Html) - => new UnitFragmentDownloader( - context.Web, - HtmlTransformer(), - HtmlFailurePredicates(), - _parallelism, - context.DownloadLogger), - // ──────────────── fragmented binary ──────────────── - (Channel.Plain, FragmentMode.Fragmented, ContentKind.Binary) - => new UnitFragmentDownloaderBinary( - context.Client, - ByteTransformer(), - ByteFailurePredicates(), - _parallelism, - context.DownloadLogger), - // ──────────────── single HTML ──────────────── - (Channel.Plain, FragmentMode.Single, ContentKind.Html) - => new UnitDownloader( - context.Web, - HtmlTransformer(), - HtmlFailurePredicates()), - // ──────────────── single binary ──────────────── - (Channel.Plain, FragmentMode.Single, ContentKind.Binary) - => new UnitDownloaderBinary( - context.Client, - ByteTransformer(), - ByteFailurePredicates()), - // ──────────────── single playwright binary ──────────────── - (Channel.Playwright, FragmentMode.Single, ContentKind.Binary) - => new PlaywrightUnitDownloader( - context.Client, - EnsureExists(_playwrightManipulator), - ByteTransformer(), - ByteFailurePredicates() - ), - // ──────────────── single playwrigt HTML ──────────────── - (Channel.Playwright, FragmentMode.Single, ContentKind.Html) - => new PlaywrightUnitPageDownloader( - context.Web, - EnsureExists(_playwrightManipulator), - HtmlTransformer(), - HtmlFailurePredicates()), - // ──────────────── single stealth HTML ──────────────── - (Channel.Stealth, FragmentMode.Single, ContentKind.Html) - => new StealthUnitPageDownloader( - context.Web, - EnsureExists(_stealthConfig), - EnsureExists(_stealthManipulator), - HtmlTransformer(), - HtmlFailurePredicates()), - // ──────────────── single stealth binary ──────────────── + // ──────────────── fragmented ──────────────── + (Channel.Plain, FragmentMode.Fragmented, _) + => new UnitFragmentDownloader(options), + // ──────────────── single ──────────────── + (Channel.Plain, FragmentMode.Single, _) + => new UnitDownloader(options), + // ──────────────── single playwright ──────────────── + (Channel.Playwright, FragmentMode.Single, _) + => new PlaywrightUnitDownloader(options, EnsureExists(_playwrightManipulator)), + // ──────────────── single stealth file ──────────────── + (Channel.Stealth, FragmentMode.Single, ContentKind.File) + => new StealthUnitPageDownloader(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), + // ──────────────── single stealth binary ──────────────── (Channel.Stealth, FragmentMode.Single, ContentKind.Binary) - => new StealthUnitDownloader( - context.Client, + => new StealthUnitDownloader(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), + // ──────────────── fragment stealth file ──────────────── + (Channel.Stealth, FragmentMode.Fragmented, ContentKind.File) + => new StealthFragmentPageDownloader(options, EnsureExists(_stealthConfig), - EnsureExists(_stealthManipulator), - ByteTransformer(), - ByteFailurePredicates()), - // ──────────────── fragment stealth HTML ──────────────── - (Channel.Stealth, FragmentMode.Fragmented, ContentKind.Html) - => new StealthFragmentPageDownloader( - context.Web, - EnsureExists(_stealthConfig), - EnsureExists(_stealthManipulator), - HtmlTransformer(), - HtmlFailurePredicates()), + EnsureExists(_stealthManipulator)), // ──────────────── fragment stealth binary ──────────────── (Channel.Stealth, FragmentMode.Fragmented, ContentKind.Binary) - => new StealthFragmentDownloader( - context.Client, + => new StealthFragmentDownloader(options, EnsureExists(_stealthConfig), - EnsureExists(_stealthManipulator), - ByteTransformer(), - ByteFailurePredicates()), + EnsureExists(_stealthManipulator)), _ => throw new Exception(string.Format(Exceptions.Exceptions.fluent_unsupported_pattern, $"({_channel}, {_fragmentMode}, {_contentKind})")), }; diff --git a/Beam.Fluent/Core/FragmentMode.cs b/Beam.Fluent/Core/FragmentMode.cs index a2e29b1..32aa7de 100644 --- a/Beam.Fluent/Core/FragmentMode.cs +++ b/Beam.Fluent/Core/FragmentMode.cs @@ -12,6 +12,6 @@ public enum Channel { } public enum ContentKind { - Html, + File, Binary } \ No newline at end of file diff --git a/Beam.Fluent/DownloadStage.cs b/Beam.Fluent/DownloadStage.cs index 3186cc1..4b3a896 100644 --- a/Beam.Fluent/DownloadStage.cs +++ b/Beam.Fluent/DownloadStage.cs @@ -1,10 +1,11 @@ using System.Collections.Concurrent; using System.Text.Json; +using Beam.Abstractions; using Beam.Models; namespace Beam.Fluent; -internal sealed class DownloadStage(DownloadEnumerable download) : IDownloadStage { +internal sealed class DownloadStage(DownloadEnumerable download) : IDownloadStage where RawType : IDocument { private IAsyncEnumerable> _download = download; public DownloadEnumerable AsAsyncEnumerable() { diff --git a/Beam.Fluent/FluentDownload.cs b/Beam.Fluent/FluentDownload.cs index 745329d..bce657a 100644 --- a/Beam.Fluent/FluentDownload.cs +++ b/Beam.Fluent/FluentDownload.cs @@ -1,4 +1,5 @@ using aeqw89.DataKeys; +using Beam.Abstractions; using Beam.Data; using Beam.Downloaders; using Beam.Dynamic; @@ -7,13 +8,13 @@ using Beam.Models; namespace Beam.Fluent; public static class FluentDownload { - public static ITransformStage Links(params IEnumerable links) { + public static ITransformStage Links(params IEnumerable links) where RawType : IDocument { return new TransformStage(new DownloadContextBuilder() .WithLinks(links)); } public static ITransformStage - ResourceDefinition(ResourceDefinition definition) { + ResourceDefinition(ResourceDefinition definition) where RawType : IDocument { if (definition.Location.States.Count == 0) throw new ArgumentException(Exceptions.Exceptions.resource_definition_invalid_states_count, nameof(definition)); var linkGenerator = new OrderedLinkGenerator(definition.Location.Segments, (NumberedStateChanger)definition.Location.StateChanger.Behavior, @@ -22,7 +23,7 @@ public static class FluentDownload { .WithLinks(StringEnumerable.FromGenerator(linkGenerator!))); } - public static ITransformStage FromContext(DownloadContext existing) { + public static ITransformStage FromContext(DownloadContext existing) where RawType : IDocument { return new TransformStage(DownloadContextBuilder.FromContext(existing)); } } \ No newline at end of file diff --git a/Beam.Fluent/TransformStage.cs b/Beam.Fluent/TransformStage.cs index 778c1c2..953ee14 100644 --- a/Beam.Fluent/TransformStage.cs +++ b/Beam.Fluent/TransformStage.cs @@ -1,11 +1,12 @@ -using Beam.Data; +using Beam.Abstractions; +using Beam.Data; using Beam.Downloaders; using Beam.Dynamic; using Beam.Models; namespace Beam.Fluent; -internal sealed class TransformStage(DownloadContextBuilder CtxBuilder) : ITransformStage { +internal sealed class TransformStage(DownloadContextBuilder CtxBuilder) : ITransformStage where RawType : IDocument { public IContextStage WithTransformer(AsyncTransformer transformer) { return new ContextStage(CtxBuilder, transformer); } diff --git a/Beam.Models/ByteDocument.cs b/Beam.Models/ByteDocument.cs index 33600b8..9be20e1 100644 --- a/Beam.Models/ByteDocument.cs +++ b/Beam.Models/ByteDocument.cs @@ -1,15 +1,24 @@ using System.Text; namespace Beam.Models { - public class ByteDocument(string filename, byte[] content, Encoding? encoding = null) : Document(filename, encoding) { - public byte[] Content { get; set; } = content; + public class ByteDocument : Document { + public ByteDocument(string filename, byte[] content, Encoding? encoding = null) : base(filename, encoding) { + Content = content; + } + public ByteDocument(string filename, Memory content, Encoding? encoding = null) : + base(filename, encoding) { + Content = content; + } + + public Memory Content { get; set; } + public override byte[] ToBytes() { - return Content; + return Content.ToArray(); } public override string ToString() { - return Encoding.GetString(Content); + return Encoding.GetString(Content.ToArray()); } } } diff --git a/Beam.Models/DownloadReport.cs b/Beam.Models/DownloadReport.cs index b27dc57..9464635 100644 --- a/Beam.Models/DownloadReport.cs +++ b/Beam.Models/DownloadReport.cs @@ -2,7 +2,9 @@ namespace Beam.Models { public struct DownloadReport : IDownloadReport { - // TODO implement download report + public long BytesDownloaded { get; init; } + public long? BytesRemaining { get; init; } + } } diff --git a/Beam.Playwright/PlaywrightUnitDownloader.cs b/Beam.Playwright/PlaywrightUnitDownloader.cs index e4276e8..8e203e8 100644 --- a/Beam.Playwright/PlaywrightUnitDownloader.cs +++ b/Beam.Playwright/PlaywrightUnitDownloader.cs @@ -1,36 +1,36 @@ -using Beam.Downloaders; +using Beam.Abstractions; +using Beam.Downloaders; using Beam.Models; using Microsoft.Playwright; namespace Beam.Playwright { - public class PlaywrightUnitDownloader : UnitDownloaderBinary { - public PlaywrightAsyncManipulator PuppetManipulator { get; } + public class PlaywrightUnitDownloader( + UnitDownloaderOptions options, + PlaywrightAsyncManipulator puppetManipulator) + : UnitDownloader(options) + where RawType : IDocument { + public PlaywrightAsyncManipulator PuppetManipulator { get; } = puppetManipulator; - public PlaywrightUnitDownloader(HttpClient client, PlaywrightAsyncManipulator puppetManipulator, AsyncTransformer asyncHtmlTransformer, AsyncDownloadFailurePredicate[] asyncDownloadFailurePredicates) - : base(client, asyncHtmlTransformer, asyncDownloadFailurePredicates) { - PuppetManipulator = puppetManipulator; - } - - protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) { + protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, CancellationToken ct) { var page = await PlaywrightContext.Browser.Value.NewPageAsync(); try { - await page.GotoAsync(link); + await page.GotoAsync(url); await PuppetManipulator(page); var download = await page.WaitForDownloadAsync(); - using var stream = await download.CreateReadStreamAsync(); - byte[] content = new byte[stream.Length]; - - await stream.ReadExactlyAsync(content, ct); - - ByteDocument doc = new ByteDocument(download.SuggestedFilename, content); - if (FailurePredicates is not null && await IsFailure(doc)) - return (false, default); - - var transformed = await Transformer(doc); - return (true, transformed); - } catch (Exception) { - return (false, default); + await using var stream = await download.CreateReadStreamAsync(); + var buffer = new byte[bufferSize]; + var inBuffer = 0; + var downloaded = 0; + while ((inBuffer = stream.Read(buffer)) > 0) { + downloaded += inBuffer; + progress?.Report(new DownloadReport() { + BytesDownloaded = downloaded, + BytesRemaining = stream.Length - downloaded + }); + await destinationStream.WriteAsync(buffer.AsMemory(0, inBuffer), ct); + } + } finally { if (!page.IsClosed) await page.CloseAsync(); diff --git a/Beam.Playwright/PlaywrightUnitPageDownloader.cs b/Beam.Playwright/PlaywrightUnitPageDownloader.cs deleted file mode 100644 index 2642f00..0000000 --- a/Beam.Playwright/PlaywrightUnitPageDownloader.cs +++ /dev/null @@ -1,39 +0,0 @@ - -using Beam.Downloaders; -using Beam.Models; -using HtmlAgilityPack; -using Microsoft.Playwright; - -namespace Beam.Playwright { - public class PlaywrightUnitPageDownloader : UnitDownloader { - public PlaywrightAsyncManipulator PuppetManipulator { get; } - - public PlaywrightUnitPageDownloader(HtmlWeb web, PlaywrightAsyncManipulator puppetManipulator, AsyncTransformer asyncHtmlTransformer, AsyncDownloadFailurePredicate[] asyncDownloadFailurePredicates) - : base(web, asyncHtmlTransformer, asyncDownloadFailurePredicates) { - PuppetManipulator = puppetManipulator; - } - - protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) { - var page = await PlaywrightContext.Browser.Value.NewPageAsync(); - try { - await page.GotoAsync(link); - await PuppetManipulator(page); - var content = await page.ContentAsync(); - await page.CloseAsync(); - - HtmlDocument doc = new(); - doc.LoadHtml(content); - var transformed = await Transformer(doc); - if (FailurePredicates is null || !(await IsFailure(doc))) - return (true, transformed); - return (false, default); - } catch (Exception) { - return (false, default); - } finally { - if (!page.IsClosed) - await page.CloseAsync(); - } - } - } - -} diff --git a/Beam.Stealth/StealthFragmentDownloader.cs b/Beam.Stealth/StealthFragmentDownloader.cs index 20d0868..d471165 100644 --- a/Beam.Stealth/StealthFragmentDownloader.cs +++ b/Beam.Stealth/StealthFragmentDownloader.cs @@ -4,11 +4,12 @@ using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; +using Beam.Abstractions; using Beam.Downloaders; using Beam.Models; namespace Beam.Stealth { - public class StealthFragmentDownloader : UnitFragmentDownloaderBinary { - public StealthFragmentDownloader(HttpClient client, StealthConfig config, StealthAsyncManipulator manipulator, AsyncTransformer transformer, AsyncDownloadFailurePredicate?[]? failurePredicate = null, int fragmentSize = 4, ILogger? logger = null) : base(client, transformer, failurePredicate, fragmentSize, logger, new StealthUnitDownloader(client, config, manipulator, transformer, failurePredicate)) {} + public class StealthFragmentDownloader : UnitFragmentDownloader where RawType : IDocument { + public StealthFragmentDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitDownloader(options, config, manipulator)) {} } } diff --git a/Beam.Stealth/StealthFragmentPageDownloader.cs b/Beam.Stealth/StealthFragmentPageDownloader.cs index 6a33b52..9d931f5 100644 --- a/Beam.Stealth/StealthFragmentPageDownloader.cs +++ b/Beam.Stealth/StealthFragmentPageDownloader.cs @@ -5,11 +5,12 @@ using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; +using Beam.Abstractions; using Beam.Downloaders; using Beam.Models; namespace Beam.Stealth { - public class StealthFragmentPageDownloader : UnitFragmentDownloader { - public StealthFragmentPageDownloader(HtmlWeb web, StealthConfig config, StealthAsyncManipulator manipulator, AsyncTransformer transformer, AsyncDownloadFailurePredicate?[]? failurePredicate = null, int fragmentSize = 4, ILogger? logger = null) : base(web, transformer, failurePredicate, fragmentSize, logger, new StealthUnitPageDownloader(web, config, manipulator, transformer, failurePredicate)) {} + public class StealthFragmentPageDownloader : UnitFragmentDownloader where RawType : IDocument { + public StealthFragmentPageDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitPageDownloader(options, config, manipulator)) {} } } diff --git a/Beam.Stealth/StealthUnitDownloader.cs b/Beam.Stealth/StealthUnitDownloader.cs index fa2b2e9..5a5ac27 100644 --- a/Beam.Stealth/StealthUnitDownloader.cs +++ b/Beam.Stealth/StealthUnitDownloader.cs @@ -6,50 +6,37 @@ using System.Diagnostics; using System.Linq; using System.Text; using System.Threading.Tasks; +using Beam.Abstractions; using Beam.Downloaders; using Beam.Models; namespace Beam.Stealth { using File = System.IO.File; - public class StealthUnitDownloader : UnitDownloaderBinary { + public class StealthUnitDownloader : UnitDownloader where RawType : IDocument { public StealthConfig Config { get; } public StealthAsyncManipulator Manipulator { get; } - private ILogger? Logger => Config.Logger; - public StealthUnitDownloader(HttpClient client, StealthConfig config, StealthAsyncManipulator manipulator, AsyncTransformer transformer, AsyncDownloadFailurePredicate?[]? failurePredicates = null) : base(client, transformer, failurePredicates) { + public StealthUnitDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) { Config = config; Manipulator = manipulator; } - protected override async Task<(bool Success, T? Result)> TryDownloadWithNoRetries( - string link, CancellationToken ct) { - try { - Logger?.LogInformation("Navigating to {Link}", link); + protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, + IProgress progress, CancellationToken ct) { + var driver = Config.Driver; + await driver.Navigate().GoToUrlAsync(url); + await Manipulator(driver); - var driver = Config.Driver; - await driver.Navigate().GoToUrlAsync(link); - await Manipulator(driver); - - var sw = Stopwatch.StartNew(); - ByteDocument? doc = await WaitForDownloadAsync(link, sw, ct); - - if (doc is null || await IsFailure(doc)) - return (false, default); - - Logger?.LogInformation("Download finished in {Elapsed}", sw.Elapsed); - return (true, await Transformer(doc)); - } catch (Exception ex) { - Logger?.LogError(ex, "Error occurred downloading {Link}", link); - return (false, default); - } + await using var stream = await WaitForDownloadAsync(url, progress, Stopwatch.StartNew(), ct); + await (stream?.CopyToAsync(destinationStream, ct) ?? Task.CompletedTask); } /* --------------------------------------------------------------------- */ - private async Task WaitForDownloadAsync( - string link, Stopwatch sw, CancellationToken ct) { + private async Task WaitForDownloadAsync( + string link, IProgress progress, Stopwatch sw, CancellationToken ct) { const int PollDelayMs = 250; // how often we look const int StableDelayMs = 1000; // size-unchanged window @@ -80,6 +67,9 @@ namespace Beam.Stealth { // track growth long size = new FileInfo(finalPath).Length; if (size == 0 || size != lastSize) { + progress?.Report(new DownloadReport() { + BytesDownloaded = size - lastSize, + }); lastSize = size; lastChange = DateTime.UtcNow; await Task.Delay(PollDelayMs, ct); @@ -104,11 +94,7 @@ namespace Beam.Stealth { } } - byte[] bytes = await File.ReadAllBytesAsync(finalPath, ct); - Logger?.LogInformation("Download completed {Path} ({Size} bytes)", - finalPath, bytes.Length); - - return new ByteDocument(Path.GetFileName(finalPath), bytes); + return File.OpenRead(finalPath); } Logger?.LogWarning("Download timed out after {Elapsed}", sw.Elapsed); diff --git a/Beam.Stealth/StealthUnitPageDownloader.cs b/Beam.Stealth/StealthUnitPageDownloader.cs index 294bee1..48d5e01 100644 --- a/Beam.Stealth/StealthUnitPageDownloader.cs +++ b/Beam.Stealth/StealthUnitPageDownloader.cs @@ -5,39 +5,29 @@ using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; +using Beam.Abstractions; using Beam.Downloaders; using Beam.Models; namespace Beam.Stealth { - public class StealthUnitPageDownloader : UnitDownloader { + public class StealthUnitPageDownloader : UnitDownloader where RawType : IDocument { public StealthConfig Config { get; } public StealthAsyncManipulator Manipulator { get; } - private ILogger? Logger => Config.Logger; - public StealthUnitPageDownloader(HtmlWeb web, StealthConfig config, StealthAsyncManipulator manipulator, AsyncTransformer transformer, AsyncDownloadFailurePredicate?[]? failurePredicate = null) : base(web, transformer, failurePredicate) { + public StealthUnitPageDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) { Config = config; Manipulator = manipulator; } - protected async override Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) { - try { - var driver = Config.Driver; + protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, CancellationToken ct) { + var driver = Config.Driver; - await driver.Navigate().GoToUrlAsync(link); - await Manipulator(driver); + await driver.Navigate().GoToUrlAsync(url); + await Manipulator(driver); - HtmlDocument doc = new(); - doc.LoadHtml(driver.PageSource); - - if (await IsFailure(doc)) - return (false, default); - - return (true, await Transformer(doc)); - } catch (Exception e) { - Logger?.LogError(e, "Error occurred downloading {}", link); - return (false, default); - } + byte[] bytes = Encoding.UTF8.GetBytes(driver.PageSource); + await destinationStream.WriteAsync(bytes, ct); } } } diff --git a/Beam/Beam.csproj b/Beam/Beam.csproj index 0733e2d..0f5bc4e 100644 --- a/Beam/Beam.csproj +++ b/Beam/Beam.csproj @@ -7,10 +7,10 @@ True - + - + diff --git a/aeqw89.Beam/aeqw89.Beam.csproj b/aeqw89.Beam/aeqw89.Beam.csproj index 9474961..b36423c 100644 --- a/aeqw89.Beam/aeqw89.Beam.csproj +++ b/aeqw89.Beam/aeqw89.Beam.csproj @@ -7,11 +7,12 @@ Beam aeqw89 qwsdcvghyu - 2.1.6 + 2.2.0 A library for downloading internet resources https://github.com/qwsdcvghyu89/Beam https://github.com/qwsdcvghyu89/Beam aeqw89.Beam + 2.2.0 @@ -32,7 +33,7 @@ all - + true @@ -53,7 +54,7 @@ true - + true @@ -101,10 +102,10 @@ lib\$(TargetFramework) true - - lib\$(TargetFramework)\ - true - + + + + lib\$(TargetFramework) true diff --git a/aeqw89.Beam/aeqw89.Beam.csproj.bak b/aeqw89.Beam/aeqw89.Beam.csproj.bak new file mode 100644 index 0000000..20f0e68 --- /dev/null +++ b/aeqw89.Beam/aeqw89.Beam.csproj.bak @@ -0,0 +1,130 @@ + + + + net9.0 + enable + enable + Beam + aeqw89 + qwsdcvghyu + 2.1.6 + A library for downloading internet resources + https://github.com/qwsdcvghyu89/Beam + https://github.com/qwsdcvghyu89/Beam + aeqw89.Beam + 2.1.6 + + + + all + + + all + + + all + + + all + + + + + + all + + + true + + + true + + + true + + + true + + + true + + + true + + + true + + + true + + + true + + + runtime; build; native; contentfiles; analyzers; buildtransitive + true + + + runtime; build; native; contentfiles; analyzers; buildtransitive + true + + + + + lib\$(TargetFramework)\ + true + + + lib\$(TargetFramework)\ + true + + + lib\$(TargetFramework)\ + true + + + lib\$(TargetFramework)\ + true + + + lib\$(TargetFramework)\ + true + + + + + + + lib\$(TargetFramework) + true + + + lib\$(TargetFramework) + true + + + + + + + lib\$(TargetFramework) + true + + + lib\$(TargetFramework) + true + + + lib\$(TargetFramework) + true + + + lib\$(TargetFramework) + true + + + lib\$(TargetFramework) + true + + + \ No newline at end of file