From f52aa6123bd3de03f1002cedec157f880fb6b992 Mon Sep 17 00:00:00 2001 From: qwsdcvghyu89 <61093706+qwsdcvghyu89@users.noreply.github.com> Date: Sat, 15 Nov 2025 22:51:46 +1100 Subject: [PATCH] Refactor downloaders to use ByteDocument and add options builders Replaces generic RawType with ByteDocument in downloaders and context classes, simplifying type usage. Adds builder classes for FailurePredicateOptions, FragmentOptions, SkipPredicateOptions, and UnitDownloaderOptions to improve configuration flexibility. Introduces DownloadTarget enum and SkipPredicate delegate for more granular download control. Refactors Fluent API interfaces and implementations to remove RawType generics and streamline usage. Adds Playwright and Stealth download strategies for extensibility. --- Beam.Downloaders/Beam.Downloaders.csproj | 1 + Beam.Downloaders/DownloadContext.cs | 6 +- Beam.Downloaders/DownloadContextBuilder.cs | 34 +-- Beam.Downloaders/FailurePredicateOptions.cs | 9 + .../FailurePredicateOptionsBuilder.cs | 56 +++++ Beam.Downloaders/FragmentOptions.cs | 7 + Beam.Downloaders/FragmentOptionsBuilder.cs | 36 +++ Beam.Downloaders/SequentialDownloader.cs | 6 +- .../SequentialFragmentDownloader.cs | 6 +- Beam.Downloaders/SkipPredicateOptions.cs | 48 ++++ Beam.Downloaders/UnitDownloader.cs | 70 ++++-- Beam.Downloaders/UnitDownloaderOptions.cs | 210 +++--------------- .../UnitDownloaderOptionsBuilder.cs | 107 +++++++++ Beam.Downloaders/UnitFragmentDownloader.cs | 8 +- Beam.Fluent/ContextStage.cs | 62 +++--- Beam.Fluent/Core/IContextStage.cs | 20 +- Beam.Fluent/Core/IDownloadStage.cs | 8 +- Beam.Fluent/Core/ITransformStage.cs | 4 +- Beam.Fluent/DownloadStage.cs | 8 +- Beam.Fluent/FluentDownload.cs | 14 +- Beam.Fluent/TransformStage.cs | 6 +- Beam.Models/DownloadTarget.cs | 28 +++ Beam.Models/SkipPredicate.cs | 5 + Beam.Playwright/PlaywrightUnitDownloader.cs | 37 ++- .../Strategies/IDownloadStrategy.cs | 9 + .../Strategies/PageDownloadStrategy.cs | 15 ++ .../Strategies/WaitingDownloadStrategy.cs | 25 +++ Beam.Stealth/StealthFragmentDownloader.cs | 4 +- Beam.Stealth/StealthFragmentPageDownloader.cs | 16 -- Beam.Stealth/StealthUnitDownloader.cs | 84 +------ Beam.Stealth/StealthUnitPageDownloader.cs | 33 --- Beam.Stealth/Strategies/IDownloadStrategy.cs | 9 + .../Strategies/PageDownloadStrategy.cs | 13 ++ .../Strategies/WaitingDownloadStrategy.cs | 83 +++++++ 34 files changed, 648 insertions(+), 439 deletions(-) create mode 100644 Beam.Downloaders/FailurePredicateOptions.cs create mode 100644 Beam.Downloaders/FailurePredicateOptionsBuilder.cs create mode 100644 Beam.Downloaders/FragmentOptions.cs create mode 100644 Beam.Downloaders/FragmentOptionsBuilder.cs create mode 100644 Beam.Downloaders/SkipPredicateOptions.cs create mode 100644 Beam.Downloaders/UnitDownloaderOptionsBuilder.cs create mode 100644 Beam.Models/DownloadTarget.cs create mode 100644 Beam.Models/SkipPredicate.cs create mode 100644 Beam.Playwright/Strategies/IDownloadStrategy.cs create mode 100644 Beam.Playwright/Strategies/PageDownloadStrategy.cs create mode 100644 Beam.Playwright/Strategies/WaitingDownloadStrategy.cs delete mode 100644 Beam.Stealth/StealthFragmentPageDownloader.cs delete mode 100644 Beam.Stealth/StealthUnitPageDownloader.cs create mode 100644 Beam.Stealth/Strategies/IDownloadStrategy.cs create mode 100644 Beam.Stealth/Strategies/PageDownloadStrategy.cs create mode 100644 Beam.Stealth/Strategies/WaitingDownloadStrategy.cs diff --git a/Beam.Downloaders/Beam.Downloaders.csproj b/Beam.Downloaders/Beam.Downloaders.csproj index ab69f30..1526f93 100644 --- a/Beam.Downloaders/Beam.Downloaders.csproj +++ b/Beam.Downloaders/Beam.Downloaders.csproj @@ -14,6 +14,7 @@ + diff --git a/Beam.Downloaders/DownloadContext.cs b/Beam.Downloaders/DownloadContext.cs index 96009af..59e0ab1 100644 --- a/Beam.Downloaders/DownloadContext.cs +++ b/Beam.Downloaders/DownloadContext.cs @@ -8,14 +8,14 @@ namespace Beam.Downloaders { //public delegate Task AsyncHtmlTransformer(HtmlDocument doc); //public delegate Task AsyncBinaryTransformer(byte[] bin); - public class DownloadContext { + public class DownloadContext { private bool disposedValue; public HttpClient Client { get; } public HtmlWeb Web { get; } public IProgress? DownloadReporter { get; set; } public IProgress? RetryReporter { get; set; } - public AsyncDownloadFailurePredicate?[]? AsyncFailurePredicates { get; } + public AsyncDownloadFailurePredicate?[]? AsyncFailurePredicates { get; } public TimeSpan TimeOut { get; set; } public IEnumerable Links { get; } public CancellationToken CancellationToken { get; } @@ -28,7 +28,7 @@ namespace Beam.Downloaders { CancellationToken cancellationToken = default, IProgress? downloadReporter = null, IProgress? retryReporter = null, - AsyncDownloadFailurePredicate?[]? asyncFailurePredicates = null, + AsyncDownloadFailurePredicate?[]? asyncFailurePredicates = null, TimeSpan? timeOut = null, ILogger? downloadLogger = null) { ArgumentNullException.ThrowIfNull(web, nameof(web)); diff --git a/Beam.Downloaders/DownloadContextBuilder.cs b/Beam.Downloaders/DownloadContextBuilder.cs index d92277a..41f5e13 100644 --- a/Beam.Downloaders/DownloadContextBuilder.cs +++ b/Beam.Downloaders/DownloadContextBuilder.cs @@ -5,12 +5,12 @@ using Microsoft.Extensions.Logging; namespace Beam.Downloaders { - public class DownloadContextBuilder { + public class DownloadContextBuilder { private HtmlWeb _web; private HttpClient _client; private IProgress? _downloadReporter; private IProgress? _retryReporter; - private AsyncDownloadFailurePredicate?[] _asyncFailurePredicates = []; + private AsyncDownloadFailurePredicate?[] _asyncFailurePredicates = []; private TimeSpan _timeOut; private IEnumerable _links; private CancellationToken _cancellationToken; @@ -26,60 +26,60 @@ namespace Beam.Downloaders { _links = []; } - public DownloadContextBuilder WithWeb(HtmlWeb web) { + public DownloadContextBuilder WithWeb(HtmlWeb web) { _web = web; return this; } - public DownloadContextBuilder WithClient(HttpClient client) { + public DownloadContextBuilder WithClient(HttpClient client) { _client = client; return this; } - public DownloadContextBuilder WithDownloadReporter(IProgress downloadReporter) { + public DownloadContextBuilder WithDownloadReporter(IProgress downloadReporter) { _downloadReporter = downloadReporter; return this; } - public DownloadContextBuilder WithRetryReporter(IProgress retryReporter) { + public DownloadContextBuilder WithRetryReporter(IProgress retryReporter) { _retryReporter = retryReporter; return this; } - public DownloadContextBuilder WithAsyncFailurePredicates(params AsyncDownloadFailurePredicate[] predicates) { + public DownloadContextBuilder WithAsyncFailurePredicates(params AsyncDownloadFailurePredicate[] predicates) { _asyncFailurePredicates = predicates; return this; } - public DownloadContextBuilder WithTimeOut(TimeSpan timeOut) { + public DownloadContextBuilder WithTimeOut(TimeSpan timeOut) { _timeOut = timeOut; return this; } - public DownloadContextBuilder WithLinks(IEnumerable links) { + public DownloadContextBuilder WithLinks(IEnumerable links) { _links = links; return this; } - public DownloadContextBuilder WithCancellationToken(CancellationToken cancellationToken) { + public DownloadContextBuilder WithCancellationToken(CancellationToken cancellationToken) { _cancellationToken = cancellationToken; return this; } - public DownloadContextBuilder WithCache(DocumentCache cache) { + public DownloadContextBuilder WithCache(DocumentCache cache) { _cache = cache; return this; } - public DownloadContextBuilder WithDownloadLogger(ILogger downloadLogger) { + public DownloadContextBuilder WithDownloadLogger(ILogger downloadLogger) { _downloadLogger = downloadLogger; return this; } - public DownloadContext Build() { + public DownloadContext Build() { // Construct the DownloadContext using the collected values. - var context = new DownloadContext( + var context = new DownloadContext( web: _web, client: _client, links: _links, @@ -100,15 +100,15 @@ namespace Beam.Downloaders { return context; } - public static DownloadContextBuilder FromContext(DownloadContext existing) { + public static DownloadContextBuilder FromContext(DownloadContext existing) { if (existing == null) throw new ArgumentNullException(nameof(existing)); - return new DownloadContextBuilder(existing.Client, existing.Web) + return new DownloadContextBuilder(existing.Client, existing.Web) .WithLinks(existing.Links) .WithCancellationToken(existing.CancellationToken) .WithDownloadReporter(existing.DownloadReporter!) .WithRetryReporter(existing.RetryReporter!) - .WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty>()) + .WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty>()) .WithTimeOut(existing.TimeOut) .WithDownloadLogger(existing.DownloadLogger!) .WithCache(existing.Cache); diff --git a/Beam.Downloaders/FailurePredicateOptions.cs b/Beam.Downloaders/FailurePredicateOptions.cs new file mode 100644 index 0000000..e804577 --- /dev/null +++ b/Beam.Downloaders/FailurePredicateOptions.cs @@ -0,0 +1,9 @@ +using Beam.Models; + +namespace Beam.Downloaders; + +public record class FailurePredicateOptions { + public required AsyncDownloadFailurePredicate?[]? AsyncDownloadFailurePredicates { get; init; } + public bool ProcessInParallel { get; init; } = false; + public int? ParallelThreads { get; init; } +} \ No newline at end of file diff --git a/Beam.Downloaders/FailurePredicateOptionsBuilder.cs b/Beam.Downloaders/FailurePredicateOptionsBuilder.cs new file mode 100644 index 0000000..d5ec367 --- /dev/null +++ b/Beam.Downloaders/FailurePredicateOptionsBuilder.cs @@ -0,0 +1,56 @@ +using Beam.Models; + +namespace Beam.Downloaders; + +public sealed class FailurePredicateOptionsBuilder +{ + private readonly System.Collections.Generic.List?> _predicates = + new System.Collections.Generic.List?>(); + private bool _processInParallel = false; + private int? _parallelThreads = null; + + public FailurePredicateOptionsBuilder WithPredicate(AsyncDownloadFailurePredicate? predicate) + { + _predicates.Add(predicate); + return this; + } + + public FailurePredicateOptionsBuilder WithPredicates(System.Collections.Generic.IEnumerable?> predicates) + { + if (predicates == null) throw new System.ArgumentNullException(nameof(predicates)); + _predicates.AddRange(predicates); + return this; + } + + public FailurePredicateOptionsBuilder WithPredicates(params AsyncDownloadFailurePredicate?[] predicates) + { + _predicates.Clear(); + if (predicates != null) _predicates.AddRange(predicates); + return this; + } + + public FailurePredicateOptionsBuilder WithProcessInParallel(bool value = true) + { + _processInParallel = value; + return this; + } + + public FailurePredicateOptionsBuilder WithParallelThreads(int? threads) + { + if (threads.HasValue && threads.Value <= 0) + throw new System.ArgumentOutOfRangeException(nameof(threads)); + _parallelThreads = threads; + return this; + } + + public FailurePredicateOptions Build() + { + var arr = _predicates.Count == 0 ? [] : _predicates.ToArray(); + return new FailurePredicateOptions + { + AsyncDownloadFailurePredicates = arr, + ProcessInParallel = _processInParallel, + ParallelThreads = _parallelThreads + }; + } +} \ No newline at end of file diff --git a/Beam.Downloaders/FragmentOptions.cs b/Beam.Downloaders/FragmentOptions.cs new file mode 100644 index 0000000..a00c698 --- /dev/null +++ b/Beam.Downloaders/FragmentOptions.cs @@ -0,0 +1,7 @@ +namespace Beam.Downloaders; + +public record class FragmentOptions { + public required int FragmentSize { get; init; } + public bool DownloadInParallel { get; init; } = false; + public int? ParallelThreads { get; init; } +} \ No newline at end of file diff --git a/Beam.Downloaders/FragmentOptionsBuilder.cs b/Beam.Downloaders/FragmentOptionsBuilder.cs new file mode 100644 index 0000000..49ea521 --- /dev/null +++ b/Beam.Downloaders/FragmentOptionsBuilder.cs @@ -0,0 +1,36 @@ +namespace Beam.Downloaders; + +public sealed class FragmentOptionsBuilder { + private int? _fragmentSize; + private bool _downloadInParallel = false; + private int? _parallelThreads = null; + + public FragmentOptionsBuilder WithFragmentSize(int bytes) { + if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); + _fragmentSize = bytes; + return this; + } + + public FragmentOptionsBuilder WithDownloadInParallel(bool value = true) { + _downloadInParallel = value; + return this; + } + + public FragmentOptionsBuilder WithParallelThreads(int? threads) { + if (threads.HasValue && threads.Value <= 0) + throw new System.ArgumentOutOfRangeException(nameof(threads)); + _parallelThreads = threads; + return this; + } + + public FragmentOptions Build() { + if (!_fragmentSize.HasValue) + throw new System.InvalidOperationException("FragmentSize must be provided."); + + return new FragmentOptions { + FragmentSize = _fragmentSize.Value, + DownloadInParallel = _downloadInParallel, + ParallelThreads = _parallelThreads + }; + } +} \ No newline at end of file diff --git a/Beam.Downloaders/SequentialDownloader.cs b/Beam.Downloaders/SequentialDownloader.cs index 378000b..2f7db22 100644 --- a/Beam.Downloaders/SequentialDownloader.cs +++ b/Beam.Downloaders/SequentialDownloader.cs @@ -3,9 +3,9 @@ using Beam.Models; using Microsoft.Extensions.Logging; namespace Beam.Downloaders { - public class SequentialDownloader : IAsyncEnumerator { + public class SequentialDownloader : IAsyncEnumerator { public OutType Current { get; protected set; } - public DownloadContext Context { get; } + public DownloadContext Context { get; } public ILogger? Logger { get; set; } public int LastOrder { get; set; } = 0; @@ -13,7 +13,7 @@ namespace Beam.Downloaders { public Func> GetUnitDownloader { get; set; } - public SequentialDownloader(DownloadContext context, Func, IUnitDownloader> getUnitDownloader, ILogger? logger = null) { + public SequentialDownloader(DownloadContext context, Func> getUnitDownloader, ILogger? logger = null) { Context = context; Logger = logger; LinksEnumerator = Context.Links.GetEnumerator(); diff --git a/Beam.Downloaders/SequentialFragmentDownloader.cs b/Beam.Downloaders/SequentialFragmentDownloader.cs index b4c40d5..a7a1274 100644 --- a/Beam.Downloaders/SequentialFragmentDownloader.cs +++ b/Beam.Downloaders/SequentialFragmentDownloader.cs @@ -3,10 +3,10 @@ using Beam.Models; using Microsoft.Extensions.Logging; namespace Beam.Downloaders { - public class SequentialFragmentDownloader : SequentialDownloader>> { + public class SequentialFragmentDownloader : SequentialDownloader>> { public SequentialFragmentDownloader( - DownloadContext context, - Func, IUnitDownloader>>> getUnitDownloader, + DownloadContext context, + Func>>> getUnitDownloader, ILogger? logger = null) : base(context, getUnitDownloader, logger) {} } diff --git a/Beam.Downloaders/SkipPredicateOptions.cs b/Beam.Downloaders/SkipPredicateOptions.cs new file mode 100644 index 0000000..2955cf7 --- /dev/null +++ b/Beam.Downloaders/SkipPredicateOptions.cs @@ -0,0 +1,48 @@ +using Beam.Models; + +namespace Beam.Downloaders; + +public class SkipPredicateOptions { + public required SkipPredicate?[]? SkipPredicates { get; init; } + public bool ProcessInParallel { get; init; } = false; + public int? ParallelThreads { get; init; } +} + +public class SkipPredicateOptionsBuilder { + private List?> _skipPredicates { get; set; } = []; + private bool _processInParallel { get; set; } = false; + private int? _parallelThreads { get; set; } + + public SkipPredicateOptionsBuilder WithSkipPredicate(SkipPredicate predicate, bool replace=false) { + if (replace) + _skipPredicates.Clear(); + _skipPredicates.Add(predicate); + return this; + } + + public SkipPredicateOptionsBuilder WithSkipPredicates(SkipPredicate[] predicates, + bool replace = true) { + if (replace) + _skipPredicates.Clear(); + _skipPredicates.AddRange(predicates); + return this; + } + + public SkipPredicateOptionsBuilder ProcessInParallel(bool processInParallel = true) { + _processInParallel = processInParallel; + return this; + } + + public SkipPredicateOptionsBuilder WithParallelThreads(int parallelThreads) { + _parallelThreads = parallelThreads; + return this; + } + + public SkipPredicateOptions Build() { + return new SkipPredicateOptions() { + SkipPredicates = _skipPredicates.ToArray(), + ParallelThreads = _parallelThreads, + ProcessInParallel = _processInParallel + }; + } +} \ No newline at end of file diff --git a/Beam.Downloaders/UnitDownloader.cs b/Beam.Downloaders/UnitDownloader.cs index 400f47a..cad1ccc 100644 --- a/Beam.Downloaders/UnitDownloader.cs +++ b/Beam.Downloaders/UnitDownloader.cs @@ -1,4 +1,6 @@ -using Beam.Abstractions; +using System.Diagnostics.CodeAnalysis; +using System.Text; +using Beam.Abstractions; using Beam.Models; using HtmlAgilityPack; using File = System.IO.File; @@ -11,12 +13,12 @@ namespace Beam.Downloaders { /// /// /// - public class UnitDownloader(UnitDownloaderOptions options) : IUnitDownloader where RawType : IDocument { - public UnitDownloaderOptions Options { get; } = options; + public class UnitDownloader(UnitDownloaderOptions options) : IUnitDownloader { + public UnitDownloaderOptions Options { get; } = options; public HttpClient Client => Options.Client; - public virtual AsyncTransformer Transformer => Options.AsyncTransformer; - - public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates => + public virtual AsyncTransformer Transformer => Options.AsyncTransformer; + + public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates => Options?.FailurePredicateOptions?.AsyncDownloadFailurePredicates; public int LinksPerDownload { get; } = 1; @@ -70,7 +72,7 @@ namespace Beam.Downloaders { return new ByteDocument(url, bytes); } - protected virtual async Task IsFailure(RawType doc, CancellationToken ct) { + protected virtual async Task IsFailure(ByteDocument doc, CancellationToken ct) { if (FailurePredicates is null) return false; if (!(Options?.FailurePredicateOptions?.ProcessInParallel ?? false)) @@ -103,19 +105,18 @@ namespace Beam.Downloaders { return false; } - protected virtual async Task _Download(string link, IProgress progress, CancellationToken ct) { - if (Options.DownloadFolder is not null && this is UnitDownloader) { - var path = Path.Combine(Options.DownloadFolder, Path.GetRandomFileName()); + protected virtual async Task _Download(string link, IProgress progress, CancellationToken ct) { + if (Options.DownloadFolder is not null) { + var path = Path.Combine(Options.DownloadFolder, options.GetFileNameForDownload(link, [])); await DownloadToFile(link, Options.BufferSize, path, progress, ct); - return (RawType)(object)new StringDocument(link, path); + return new ByteDocument(link, Encoding.UTF8.GetBytes(path)); } - if (this is UnitDownloader) { - return (RawType)(object)(await DownloadToMemory(link, Options.BufferSize, progress, ct)); + else { + return await DownloadToMemory(link, Options.BufferSize, progress, ct); } - throw new NotSupportedException(Exceptions.Exceptions.unit_downloader_limited_support); } - protected virtual async Task<(bool, OutType?)> Transform(RawType download, CancellationToken ct) { + protected virtual async Task<(bool, OutType?)> Transform(ByteDocument download, CancellationToken ct) { try { if (FailurePredicates is null || !(await IsFailure(download, ct))) return (true, await Transformer(download)); @@ -131,6 +132,9 @@ namespace Beam.Downloaders { return (false, default); downProgress ??= new Progress(); + + if (ShouldSkip(link[0].Data, out var defaultType)) + return (true, defaultType); OutType? ot = default; int tryCount = 0; @@ -147,5 +151,41 @@ namespace Beam.Downloaders { return (false, ot); } + + private bool ShouldSkip(string link, [NotNullWhen(true)] out OutType? outType) { + outType = default; + if (Options.SkipPredicateOptions?.SkipPredicates is null) + return false; + if (!Options.SkipPredicateOptions.ProcessInParallel) + foreach (var pred in Options.SkipPredicateOptions.SkipPredicates) { + if (pred is null) + continue; + if (pred(link, out outType)) + return true; + } + else { + var shouldSkip = false; + OutType? _outType = default; + Parallel.ForEach(Options.SkipPredicateOptions.SkipPredicates, new ParallelOptions() { + MaxDegreeOfParallelism = Options?.FailurePredicateOptions?.ParallelThreads ?? 4 + }, + (predicate, parallelLoopState) => { + if (parallelLoopState.ShouldExitCurrentIteration) + return; + if (predicate == null) + return; + if (predicate(link, out var _innerLoopOutType)) { + Interlocked.CompareExchange(ref shouldSkip, true, false); + Interlocked.CompareExchange(ref _outType, _innerLoopOutType, default); + parallelLoopState.Break(); + } + } + ); + outType = _outType; + return shouldSkip; + } + + return false; + } } } diff --git a/Beam.Downloaders/UnitDownloaderOptions.cs b/Beam.Downloaders/UnitDownloaderOptions.cs index dc99555..421c3b6 100644 --- a/Beam.Downloaders/UnitDownloaderOptions.cs +++ b/Beam.Downloaders/UnitDownloaderOptions.cs @@ -1,198 +1,38 @@ +using System.Diagnostics.CodeAnalysis; +using System.Security.Cryptography; +using System.Text; using Beam.Models; namespace Beam.Downloaders; -public record class UnitDownloaderOptions { +public record class UnitDownloaderOptions { public HttpClient Client { get; init; } = new(); + + public DownloadTarget Target { get; init; } = DownloadTarget.URL; - public FailurePredicateOptions? FailurePredicateOptions { get; init; } + public SkipPredicateOptions? SkipPredicateOptions { get; init; } + public FailurePredicateOptions? FailurePredicateOptions { get; init; } public FragmentOptions? FragmentOptions { get; init; } - public required AsyncTransformer AsyncTransformer { get; init; } + public required AsyncTransformer AsyncTransformer { get; init; } + + /// + /// The location where the download is stored. + /// + /// + /// If not defined, UnitDownloader.TryDownload() downloads to memory. + /// public string? DownloadFolder { get; init; } = null; public int BufferSize { get; init; } = 80 * 1024; // 80kb + + public string GetFileNameForDownload(string url, byte[] additionalData) { + byte[] bytes = [..Encoding.UTF8.GetBytes(url), ..additionalData]; + var name = Convert.ToBase64String(System.IO.Hashing.XxHash64.Hash(bytes)); + return name.Replace('+', '-').Replace('/', '_').Replace('=', ' ').Trim(); + } } -public record class FailurePredicateOptions { - public required AsyncDownloadFailurePredicate?[]? AsyncDownloadFailurePredicates { get; init; } - public bool ProcessInParallel { get; init; } = false; - public int? ParallelThreads { get; init; } -} +// ---------- UnitDownloaderOptions Builder ---------- -public record class FragmentOptions { - public required int FragmentSize { get; init; } - public bool DownloadInParallel { get; init; } = false; - public int? ParallelThreads { get; init; } -} +// ---------- FailurePredicateOptions Builder ---------- - - // ---------- UnitDownloaderOptions Builder ---------- - public sealed class UnitDownloaderOptionsBuilder - { - private HttpClient _client = new HttpClient(); - private FailurePredicateOptions? _failureOptions; - private FragmentOptions? _fragmentOptions; - private AsyncTransformer? _asyncTransformer; - private string? _downloadFolder = null; - private int _bufferSize = 80 * 1024; - - public UnitDownloaderOptionsBuilder WithClient(HttpClient client) - { - _client = client ?? throw new System.ArgumentNullException(nameof(client)); - return this; - } - - public UnitDownloaderOptionsBuilder WithFailurePredicateOptions(FailurePredicateOptions? options) - { - _failureOptions = options; - return this; - } - - public UnitDownloaderOptionsBuilder WithFailurePredicates(System.Action> configure) - { - if (configure == null) throw new System.ArgumentNullException(nameof(configure)); - var b = new FailurePredicateOptionsBuilder(); - configure(b); - _failureOptions = b.Build(); - return this; - } - - public UnitDownloaderOptionsBuilder WithFragmentOptions(FragmentOptions? options) - { - _fragmentOptions = options; - return this; - } - - public UnitDownloaderOptionsBuilder WithFragments(System.Action configure) - { - if (configure == null) throw new System.ArgumentNullException(nameof(configure)); - var b = new FragmentOptionsBuilder(); - configure(b); - _fragmentOptions = b.Build(); - return this; - } - - public UnitDownloaderOptionsBuilder WithAsyncTransformer(AsyncTransformer transformer) - { - _asyncTransformer = transformer ?? throw new System.ArgumentNullException(nameof(transformer)); - return this; - } - - public UnitDownloaderOptionsBuilder WithDownloadFolder(string? downloadFolder) - { - _downloadFolder = downloadFolder; - return this; - } - - public UnitDownloaderOptionsBuilder WithBufferSize(int bytes) - { - if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); - _bufferSize = bytes; - return this; - } - - public UnitDownloaderOptions Build() - { - if (_asyncTransformer == null) - throw new System.InvalidOperationException("AsyncTransformer must be provided."); - - return new UnitDownloaderOptions - { - Client = _client, - FailurePredicateOptions = _failureOptions, - FragmentOptions = _fragmentOptions, - AsyncTransformer = _asyncTransformer, - DownloadFolder = _downloadFolder, - BufferSize = _bufferSize - }; - } - } - - // ---------- FailurePredicateOptions Builder ---------- - public sealed class FailurePredicateOptionsBuilder - { - private readonly System.Collections.Generic.List?> _predicates = - new System.Collections.Generic.List?>(); - private bool _processInParallel = false; - private int? _parallelThreads = null; - - public FailurePredicateOptionsBuilder WithPredicate(AsyncDownloadFailurePredicate? predicate) - { - _predicates.Add(predicate); - return this; - } - - public FailurePredicateOptionsBuilder WithPredicates(System.Collections.Generic.IEnumerable?> predicates) - { - if (predicates == null) throw new System.ArgumentNullException(nameof(predicates)); - _predicates.AddRange(predicates); - return this; - } - - public FailurePredicateOptionsBuilder WithPredicates(params AsyncDownloadFailurePredicate?[] predicates) - { - _predicates.Clear(); - if (predicates != null) _predicates.AddRange(predicates); - return this; - } - - public FailurePredicateOptionsBuilder WithProcessInParallel(bool value = true) - { - _processInParallel = value; - return this; - } - - public FailurePredicateOptionsBuilder WithParallelThreads(int? threads) - { - if (threads.HasValue && threads.Value <= 0) - throw new System.ArgumentOutOfRangeException(nameof(threads)); - _parallelThreads = threads; - return this; - } - - public FailurePredicateOptions Build() - { - var arr = _predicates.Count == 0 ? [] : _predicates.ToArray(); - return new FailurePredicateOptions - { - AsyncDownloadFailurePredicates = arr, - ProcessInParallel = _processInParallel, - ParallelThreads = _parallelThreads - }; - } - } - - // ---------- FragmentOptions Builder ---------- - public sealed class FragmentOptionsBuilder { - private int? _fragmentSize; - private bool _downloadInParallel = false; - private int? _parallelThreads = null; - - public FragmentOptionsBuilder WithFragmentSize(int bytes) { - if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); - _fragmentSize = bytes; - return this; - } - - public FragmentOptionsBuilder WithDownloadInParallel(bool value = true) { - _downloadInParallel = value; - return this; - } - - public FragmentOptionsBuilder WithParallelThreads(int? threads) { - if (threads.HasValue && threads.Value <= 0) - throw new System.ArgumentOutOfRangeException(nameof(threads)); - _parallelThreads = threads; - return this; - } - - public FragmentOptions Build() { - if (!_fragmentSize.HasValue) - throw new System.InvalidOperationException("FragmentSize must be provided."); - - return new FragmentOptions { - FragmentSize = _fragmentSize.Value, - DownloadInParallel = _downloadInParallel, - ParallelThreads = _parallelThreads - }; - } - } +// ---------- FragmentOptions Builder ---------- \ No newline at end of file diff --git a/Beam.Downloaders/UnitDownloaderOptionsBuilder.cs b/Beam.Downloaders/UnitDownloaderOptionsBuilder.cs new file mode 100644 index 0000000..4675092 --- /dev/null +++ b/Beam.Downloaders/UnitDownloaderOptionsBuilder.cs @@ -0,0 +1,107 @@ +using Beam.Models; + +namespace Beam.Downloaders; + +public sealed class UnitDownloaderOptionsBuilder { + private DownloadTarget _target = DownloadTarget.URL; + private HttpClient _client = new HttpClient(); + private FailurePredicateOptionsBuilder _failureOptionsBuilder = new(); + private FailurePredicateOptions? _failurePredicateOptionsOverride = null; + private SkipPredicateOptionsBuilder _skipPredicateOptionsBuilder = new(); + private SkipPredicateOptions? _skipPredicateOptionsOverride = null; + private FragmentOptions? _fragmentOptions; + private AsyncTransformer? _asyncTransformer; + private string? _downloadFolder = null; + private int _bufferSize = 80 * 1024; + + public UnitDownloaderOptionsBuilder WithTarget(DownloadTarget target) { + _target = target; + return this; + } + + public UnitDownloaderOptionsBuilder WithClient(HttpClient client) + { + _client = client ?? throw new System.ArgumentNullException(nameof(client)); + return this; + } + + public UnitDownloaderOptionsBuilder WithFailurePredicateOptions(FailurePredicateOptions? options) + { + _failurePredicateOptionsOverride = options; + return this; + } + + public UnitDownloaderOptionsBuilder WithFailurePredicates(System.Action> configure) + { + if (configure == null) throw new System.ArgumentNullException(nameof(configure)); + configure(_failureOptionsBuilder); + return this; + } + + public UnitDownloaderOptionsBuilder WithFragmentOptions(FragmentOptions? options) + { + _fragmentOptions = options; + return this; + } + + public UnitDownloaderOptionsBuilder WithSkipPredicates(Action> configure) { + if (configure == null) throw new ArgumentNullException(nameof(configure)); + configure(_skipPredicateOptionsBuilder); + return this; + } + + public UnitDownloaderOptionsBuilder WithSkipPredicateOptions( + SkipPredicateOptions skipPredicateOptions) { + _skipPredicateOptionsOverride = skipPredicateOptions; + return this; + } + + public UnitDownloaderOptionsBuilder WithFragments(System.Action configure) + { + if (configure == null) throw new System.ArgumentNullException(nameof(configure)); + var b = new FragmentOptionsBuilder(); + configure(b); + _fragmentOptions = b.Build(); + return this; + } + + public UnitDownloaderOptionsBuilder WithAsyncTransformer(AsyncTransformer transformer) + { + _asyncTransformer = transformer ?? throw new System.ArgumentNullException(nameof(transformer)); + return this; + } + + public UnitDownloaderOptionsBuilder WithDownloadFolder(string? downloadFolder) + { + _downloadFolder = downloadFolder; + return this; + } + + public UnitDownloaderOptionsBuilder WithBufferSize(int bytes) + { + if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); + _bufferSize = bytes; + return this; + } + + public UnitDownloaderOptions Build() + { + if (_asyncTransformer == null) + throw new System.InvalidOperationException("AsyncTransformer must be provided."); + + _failurePredicateOptionsOverride ??= _failureOptionsBuilder.Build(); + _skipPredicateOptionsOverride ??= _skipPredicateOptionsBuilder.Build(); + + return new UnitDownloaderOptions + { + Target = _target, + Client = _client, + FailurePredicateOptions = _failurePredicateOptionsOverride, + SkipPredicateOptions = _skipPredicateOptionsOverride, + FragmentOptions = _fragmentOptions, + AsyncTransformer = _asyncTransformer, + DownloadFolder = _downloadFolder, + BufferSize = _bufferSize + }; + } +} \ No newline at end of file diff --git a/Beam.Downloaders/UnitFragmentDownloader.cs b/Beam.Downloaders/UnitFragmentDownloader.cs index a399128..017fce4 100644 --- a/Beam.Downloaders/UnitFragmentDownloader.cs +++ b/Beam.Downloaders/UnitFragmentDownloader.cs @@ -5,12 +5,12 @@ using HtmlAgilityPack; using Microsoft.Extensions.Logging; namespace Beam.Downloaders { - public class UnitFragmentDownloader(UnitDownloaderOptions options, - IUnitDownloader? internalDownloader = null) : IUnitDownloader>> where RawType : IDocument { + public class UnitFragmentDownloader(UnitDownloaderOptions options, + IUnitDownloader? internalDownloader = null) : IUnitDownloader>> { - public UnitDownloaderOptions Options { get; } = options; + public UnitDownloaderOptions Options { get; } = options; public int LinksPerDownload { get; set; } - private IUnitDownloader UnitDownloader { get; } = internalDownloader ?? new UnitDownloader(options); + private IUnitDownloader UnitDownloader { get; } = internalDownloader ?? new UnitDownloader(options); async Task<(bool, Fragment>?)> IUnitDownloader>>.TryDownload(IOrdered[] link, CancellationToken ct, int maximumRetryCount, IProgress? downProgress, IProgress? tryProgress) { Fragment> fragment = new Fragment>(link.Length); diff --git a/Beam.Fluent/ContextStage.cs b/Beam.Fluent/ContextStage.cs index c387c04..1d86a7f 100644 --- a/Beam.Fluent/ContextStage.cs +++ b/Beam.Fluent/ContextStage.cs @@ -8,14 +8,14 @@ using Beam.Downloaders; namespace Beam.Fluent; -internal sealed class ContextStage : IContextStage where RawType : IDocument { - private readonly DownloadContextBuilder _ctxBuilder; - private readonly AsyncTransformer _transformer; +internal sealed class ContextStage : IContextStage { + private readonly DownloadContextBuilder _ctxBuilder; + private readonly AsyncTransformer _transformer; private FragmentMode _fragmentMode = FragmentMode.Single; private Channel _channel = Channel.Plain; private readonly ContentKind _contentKind; private int _parallelism = 4; - private UnitDownloaderOptionsBuilder _optionsBuilder = new(); + private UnitDownloaderOptionsBuilder _optionsBuilder = new(); // ──────────────── playwright ──────────────── private PlaywrightAsyncManipulator? _playwrightManipulator = null; @@ -27,8 +27,8 @@ internal sealed class ContextStage : IContextStage ctxBuilder, - AsyncTransformer transformer) { + public ContextStage(DownloadContextBuilder ctxBuilder, + AsyncTransformer transformer) { _ctxBuilder = ctxBuilder; _transformer = transformer; _contentKind = transformer switch { @@ -43,28 +43,28 @@ internal sealed class ContextStage : IContextStage Configure(Action> configure) { + public IContextStage Configure(Action configure) { configure(_ctxBuilder); return this; } - public IContextStage ConfigureUnitDownloaderOptions( - Action> configure) { + public IContextStage ConfigureUnitDownloaderOptions( + Action> configure) { configure(_optionsBuilder); return this; } - public IContextStage WithParallelism(int degree) { + public IContextStage< OutType> WithParallelism(int degree) { _parallelism = Math.Max(1, degree); return this; } - public IContextStage WithTimeout(TimeSpan timeout) { + public IContextStage< OutType> WithTimeout(TimeSpan timeout) { _ctxBuilder.WithTimeOut(timeout); return this; } - public IContextStage WithRetryReporter(IProgress reporter) { + public IContextStage< OutType> WithRetryReporter(IProgress reporter) { _ctxBuilder.WithRetryReporter(reporter); return this; } @@ -73,7 +73,7 @@ internal sealed class ContextStage : IContextStage /// /// - public IContextStage UseFragments() { + public IContextStage< OutType> UseFragments() { if (_playwrightManipulator is not null) _playwrightManipulator = null; if (_channel == Channel.Playwright) @@ -88,7 +88,7 @@ internal sealed class ContextStage : IContextStage /// The page manipulator /// - public IContextStage UsePlaywright(PlaywrightAsyncManipulator manipulator) { + public IContextStage< OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator) { if (_fragmentMode == FragmentMode.Fragmented) _fragmentMode = FragmentMode.Single; if (_stealthManipulator is not null) @@ -99,7 +99,7 @@ internal sealed class ContextStage : IContextStage UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) { + public IContextStage< OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) { if (_playwrightManipulator is not null) _playwrightManipulator = null; @@ -109,7 +109,7 @@ internal sealed class ContextStage : IContextStage context) { + private object ConstructUnitDownloader(DownloadContext context) { #region Utility functions T To(object? o) where T : class @@ -145,27 +145,19 @@ internal sealed class ContextStage : IContextStage new UnitFragmentDownloader(options), + => new UnitFragmentDownloader< OutType>(options), // ──────────────── single ──────────────── (Channel.Plain, FragmentMode.Single, _) - => new UnitDownloader(options), + => new UnitDownloader< OutType>(options), // ──────────────── single playwright ──────────────── (Channel.Playwright, FragmentMode.Single, _) - => new PlaywrightUnitDownloader(options, EnsureExists(_playwrightManipulator)), - // ──────────────── single stealth file ──────────────── - (Channel.Stealth, FragmentMode.Single, ContentKind.File) - => new StealthUnitPageDownloader(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), - // ──────────────── single stealth binary ──────────────── + => new PlaywrightUnitDownloader< OutType>(options, EnsureExists(_playwrightManipulator)), + // ──────────────── single stealth ──────────────── (Channel.Stealth, FragmentMode.Single, ContentKind.Binary) - => new StealthUnitDownloader(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), - // ──────────────── fragment stealth file ──────────────── - (Channel.Stealth, FragmentMode.Fragmented, ContentKind.File) - => new StealthFragmentPageDownloader(options, - EnsureExists(_stealthConfig), - EnsureExists(_stealthManipulator)), - // ──────────────── fragment stealth binary ──────────────── + => new StealthUnitDownloader< OutType>(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), + // ──────────────── fragment stealth ──────────────── (Channel.Stealth, FragmentMode.Fragmented, ContentKind.Binary) - => new StealthFragmentDownloader(options, + => new StealthFragmentDownloader< OutType>(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), _ => throw new Exception(string.Format(Exceptions.Exceptions.fluent_unsupported_pattern, @@ -173,14 +165,14 @@ internal sealed class ContextStage : IContextStage> ConstructDownloader(DownloadContext context) { - var copyOfContext = DownloadContextBuilder.FromContext(context).Build(); + private IAsyncEnumerator> ConstructDownloader(DownloadContext context) { + var copyOfContext = DownloadContextBuilder.FromContext(context).Build(); return _fragmentMode switch { - FragmentMode.Fragmented => new SequentialFragmentDownloader( + FragmentMode.Fragmented => new SequentialFragmentDownloader( copyOfContext, ctx => (IUnitDownloader>>)ConstructUnitDownloader(ctx), context.DownloadLogger).UnwrapFragmented(), - FragmentMode.Single => new SequentialDownloader( + FragmentMode.Single => new SequentialDownloader< OutType>( copyOfContext, ctx => (IUnitDownloader)ConstructUnitDownloader(ctx), context.DownloadLogger).WrapOrdered(), diff --git a/Beam.Fluent/Core/IContextStage.cs b/Beam.Fluent/Core/IContextStage.cs index 5ad4486..2be7bf0 100644 --- a/Beam.Fluent/Core/IContextStage.cs +++ b/Beam.Fluent/Core/IContextStage.cs @@ -6,15 +6,15 @@ using Beam.Stealth; namespace Beam.Fluent; -public interface IContextStage { - IContextStage Configure(Action> configure); - IContextStage WithParallelism(int degree); - IContextStage WithTimeout(TimeSpan timeout); - IContextStage WithRetryReporter(IProgress reporter); - IContextStage UseFragments(); - IContextStage UsePlaywright(PlaywrightAsyncManipulator manipulator); - IContextStage UseStealth(StealthAsyncManipulator manipulator, StealthConfig config); - IContextStage ConfigureUnitDownloaderOptions( - Action> configure); +public interface IContextStage { + IContextStage Configure(Action configure); + IContextStage WithParallelism(int degree); + IContextStage WithTimeout(TimeSpan timeout); + IContextStage WithRetryReporter(IProgress reporter); + IContextStage UseFragments(); + IContextStage UsePlaywright(PlaywrightAsyncManipulator manipulator); + IContextStage UseStealth(StealthAsyncManipulator manipulator, StealthConfig config); + IContextStage ConfigureUnitDownloaderOptions( + Action> configure); DownloadEnumerable Build(); } \ No newline at end of file diff --git a/Beam.Fluent/Core/IDownloadStage.cs b/Beam.Fluent/Core/IDownloadStage.cs index 4c2c6bc..bfd3543 100644 --- a/Beam.Fluent/Core/IDownloadStage.cs +++ b/Beam.Fluent/Core/IDownloadStage.cs @@ -2,10 +2,10 @@ namespace Beam.Fluent; -public interface IDownloadStage { - IDownloadStage SaveToDirectory(string dir); - IDownloadStage SaveToFiles(IEnumerable files); - IDownloadStage SaveToMemory(ConcurrentBag bag); +public interface IDownloadStage { + IDownloadStage SaveToDirectory(string dir); + IDownloadStage SaveToFiles(IEnumerable files); + IDownloadStage SaveToMemory(ConcurrentBag bag); void WaitForDownload(); Task WaitForDownloadAsync(); DownloadEnumerable AsAsyncEnumerable(); diff --git a/Beam.Fluent/Core/ITransformStage.cs b/Beam.Fluent/Core/ITransformStage.cs index 34e2ae5..2fd6637 100644 --- a/Beam.Fluent/Core/ITransformStage.cs +++ b/Beam.Fluent/Core/ITransformStage.cs @@ -3,6 +3,6 @@ using Beam.Models; namespace Beam.Fluent; -public interface ITransformStage { - IContextStage WithTransformer(AsyncTransformer factory); +public interface ITransformStage { + IContextStage WithTransformer(AsyncTransformer factory); } \ No newline at end of file diff --git a/Beam.Fluent/DownloadStage.cs b/Beam.Fluent/DownloadStage.cs index 4b3a896..43a1b3c 100644 --- a/Beam.Fluent/DownloadStage.cs +++ b/Beam.Fluent/DownloadStage.cs @@ -5,7 +5,7 @@ using Beam.Models; namespace Beam.Fluent; -internal sealed class DownloadStage(DownloadEnumerable download) : IDownloadStage where RawType : IDocument { +internal sealed class DownloadStage(DownloadEnumerable download) : IDownloadStage { private IAsyncEnumerable> _download = download; public DownloadEnumerable AsAsyncEnumerable() { @@ -22,16 +22,16 @@ internal sealed class DownloadStage(DownloadEnumerable SaveToDirectory(string dir) { + public IDownloadStage SaveToDirectory(string dir) { _download = _SaveToDirectory(dir); return this; } - public IDownloadStage SaveToFiles(IEnumerable files) { + public IDownloadStage SaveToFiles(IEnumerable files) { throw new NotImplementedException(); } - public IDownloadStage SaveToMemory(ConcurrentBag bag) { + public IDownloadStage SaveToMemory(ConcurrentBag bag) { throw new NotImplementedException(); } diff --git a/Beam.Fluent/FluentDownload.cs b/Beam.Fluent/FluentDownload.cs index bce657a..b092d4d 100644 --- a/Beam.Fluent/FluentDownload.cs +++ b/Beam.Fluent/FluentDownload.cs @@ -8,22 +8,22 @@ using Beam.Models; namespace Beam.Fluent; public static class FluentDownload { - public static ITransformStage Links(params IEnumerable links) where RawType : IDocument { - return new TransformStage(new DownloadContextBuilder() + public static ITransformStage Links(params IEnumerable links) { + return new TransformStage(new DownloadContextBuilder() .WithLinks(links)); } - public static ITransformStage - ResourceDefinition(ResourceDefinition definition) where RawType : IDocument { + public static ITransformStage< OutType> + ResourceDefinition< OutType>(ResourceDefinition definition) { if (definition.Location.States.Count == 0) throw new ArgumentException(Exceptions.Exceptions.resource_definition_invalid_states_count, nameof(definition)); var linkGenerator = new OrderedLinkGenerator(definition.Location.Segments, (NumberedStateChanger)definition.Location.StateChanger.Behavior, definition.Location.States.First().Copy()); - return new TransformStage(new DownloadContextBuilder() + return new TransformStage< OutType>(new DownloadContextBuilder() .WithLinks(StringEnumerable.FromGenerator(linkGenerator!))); } - public static ITransformStage FromContext(DownloadContext existing) where RawType : IDocument { - return new TransformStage(DownloadContextBuilder.FromContext(existing)); + public static ITransformStage< OutType> FromContext< OutType>(DownloadContext existing) { + return new TransformStage< OutType>(DownloadContextBuilder.FromContext(existing)); } } \ No newline at end of file diff --git a/Beam.Fluent/TransformStage.cs b/Beam.Fluent/TransformStage.cs index 953ee14..1e35b3b 100644 --- a/Beam.Fluent/TransformStage.cs +++ b/Beam.Fluent/TransformStage.cs @@ -6,8 +6,8 @@ using Beam.Models; namespace Beam.Fluent; -internal sealed class TransformStage(DownloadContextBuilder CtxBuilder) : ITransformStage where RawType : IDocument { - public IContextStage WithTransformer(AsyncTransformer transformer) { - return new ContextStage(CtxBuilder, transformer); +internal sealed class TransformStage(DownloadContextBuilder CtxBuilder) : ITransformStage { + public IContextStage WithTransformer(AsyncTransformer transformer) { + return new ContextStage(CtxBuilder, transformer); } } \ No newline at end of file diff --git a/Beam.Models/DownloadTarget.cs b/Beam.Models/DownloadTarget.cs new file mode 100644 index 0000000..adaf393 --- /dev/null +++ b/Beam.Models/DownloadTarget.cs @@ -0,0 +1,28 @@ +namespace Beam.Models; + +public enum DownloadTarget { + /// + /// Specifies the target as the object directly returned through accessing the URL (whole page). + /// + /// + /// Default to this mode where possible. + /// + URL, + /// + /// Specifies the target as an object accessible only through the url (element in page). + /// + /// + /// Only use this mode if what is needed + /// cannot be acquired by using + /// + InURL, + /// + /// Specifies the target as an object that may be retrieved through a user-defined operation on the url + /// (e.g. javascript triggered downloads). + /// + /// + /// Only use this mode if what is needed cannot be acquired by either + /// or + /// + Complex +} \ No newline at end of file diff --git a/Beam.Models/SkipPredicate.cs b/Beam.Models/SkipPredicate.cs new file mode 100644 index 0000000..d1a4144 --- /dev/null +++ b/Beam.Models/SkipPredicate.cs @@ -0,0 +1,5 @@ +using System.Diagnostics.CodeAnalysis; + +namespace Beam.Models; + +public delegate bool SkipPredicate(string link, [NotNullWhen(true)] out T defaultValue); diff --git a/Beam.Playwright/PlaywrightUnitDownloader.cs b/Beam.Playwright/PlaywrightUnitDownloader.cs index 8e203e8..e2cdba2 100644 --- a/Beam.Playwright/PlaywrightUnitDownloader.cs +++ b/Beam.Playwright/PlaywrightUnitDownloader.cs @@ -1,36 +1,31 @@ using Beam.Abstractions; using Beam.Downloaders; using Beam.Models; +using Beam.Playwright.Strategies; using Microsoft.Playwright; namespace Beam.Playwright { - public class PlaywrightUnitDownloader( - UnitDownloaderOptions options, - PlaywrightAsyncManipulator puppetManipulator) - : UnitDownloader(options) - where RawType : IDocument { - public PlaywrightAsyncManipulator PuppetManipulator { get; } = puppetManipulator; + public class PlaywrightUnitDownloader : UnitDownloader { + public PlaywrightUnitDownloader(UnitDownloaderOptions options, + PlaywrightAsyncManipulator puppetManipulator) : base(options) { + PuppetManipulator = puppetManipulator; + _downloadStrategy = options.Target switch { + DownloadTarget.URL or DownloadTarget.InURL => new PageDownloadStrategy(), + DownloadTarget.Complex => new WaitingDownloadStrategy(), + _ => throw new NotSupportedException() // TODO add an exception message + }; + } + + public PlaywrightAsyncManipulator PuppetManipulator { get; } + private IDownloadStrategy _downloadStrategy { get; } + protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, CancellationToken ct) { var page = await PlaywrightContext.Browser.Value.NewPageAsync(); try { await page.GotoAsync(url); await PuppetManipulator(page); - var download = await page.WaitForDownloadAsync(); - - await using var stream = await download.CreateReadStreamAsync(); - var buffer = new byte[bufferSize]; - var inBuffer = 0; - var downloaded = 0; - while ((inBuffer = stream.Read(buffer)) > 0) { - downloaded += inBuffer; - progress?.Report(new DownloadReport() { - BytesDownloaded = downloaded, - BytesRemaining = stream.Length - downloaded - }); - await destinationStream.WriteAsync(buffer.AsMemory(0, inBuffer), ct); - } - + await _downloadStrategy.DownloadToStream(url, bufferSize, destinationStream, progress, page, ct); } finally { if (!page.IsClosed) await page.CloseAsync(); diff --git a/Beam.Playwright/Strategies/IDownloadStrategy.cs b/Beam.Playwright/Strategies/IDownloadStrategy.cs new file mode 100644 index 0000000..e3c645d --- /dev/null +++ b/Beam.Playwright/Strategies/IDownloadStrategy.cs @@ -0,0 +1,9 @@ +using Beam.Abstractions; +using Microsoft.Playwright; + +namespace Beam.Playwright.Strategies; + +internal interface IDownloadStrategy { + Task DownloadToStream(string url, int bufferSize, Stream destinationStream, + IProgress progress, IPage page, CancellationToken ct); +} \ No newline at end of file diff --git a/Beam.Playwright/Strategies/PageDownloadStrategy.cs b/Beam.Playwright/Strategies/PageDownloadStrategy.cs new file mode 100644 index 0000000..d975407 --- /dev/null +++ b/Beam.Playwright/Strategies/PageDownloadStrategy.cs @@ -0,0 +1,15 @@ +using System.Text; +using Beam.Abstractions; +using Microsoft.Playwright; + +namespace Beam.Playwright.Strategies; + +internal class PageDownloadStrategy : IDownloadStrategy { + public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, IPage page, + CancellationToken ct) { + + var source = await page.InnerHTMLAsync("html", new PageInnerHTMLOptions() { Strict = false }); + var bytes = Encoding.UTF8.GetBytes(source); + await destinationStream.WriteAsync(bytes, ct); + } +} \ No newline at end of file diff --git a/Beam.Playwright/Strategies/WaitingDownloadStrategy.cs b/Beam.Playwright/Strategies/WaitingDownloadStrategy.cs new file mode 100644 index 0000000..55c901b --- /dev/null +++ b/Beam.Playwright/Strategies/WaitingDownloadStrategy.cs @@ -0,0 +1,25 @@ +using Beam.Abstractions; +using Beam.Models; +using Microsoft.Playwright; + +namespace Beam.Playwright.Strategies; + +internal class WaitingDownloadStrategy : IDownloadStrategy { + public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, IPage page, + CancellationToken ct) { + + var download = await page.WaitForDownloadAsync(); + await using var stream = await download.CreateReadStreamAsync(); + var buffer = new byte[bufferSize]; + var inBuffer = 0; + var downloaded = 0; + while ((inBuffer = stream.Read(buffer)) > 0) { + downloaded += inBuffer; + progress?.Report(new DownloadReport() { + BytesDownloaded = downloaded, + BytesRemaining = stream.Length - downloaded + }); + await destinationStream.WriteAsync(buffer.AsMemory(0, inBuffer), ct); + } + } +} \ No newline at end of file diff --git a/Beam.Stealth/StealthFragmentDownloader.cs b/Beam.Stealth/StealthFragmentDownloader.cs index d471165..cd2b29b 100644 --- a/Beam.Stealth/StealthFragmentDownloader.cs +++ b/Beam.Stealth/StealthFragmentDownloader.cs @@ -9,7 +9,7 @@ using Beam.Downloaders; using Beam.Models; namespace Beam.Stealth { - public class StealthFragmentDownloader : UnitFragmentDownloader where RawType : IDocument { - public StealthFragmentDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitDownloader(options, config, manipulator)) {} + public class StealthFragmentDownloader : UnitFragmentDownloader { + public StealthFragmentDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitDownloader(options, config, manipulator)) {} } } diff --git a/Beam.Stealth/StealthFragmentPageDownloader.cs b/Beam.Stealth/StealthFragmentPageDownloader.cs deleted file mode 100644 index 9d931f5..0000000 --- a/Beam.Stealth/StealthFragmentPageDownloader.cs +++ /dev/null @@ -1,16 +0,0 @@ -using HtmlAgilityPack; -using Microsoft.Extensions.Logging; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Beam.Abstractions; -using Beam.Downloaders; -using Beam.Models; - -namespace Beam.Stealth { - public class StealthFragmentPageDownloader : UnitFragmentDownloader where RawType : IDocument { - public StealthFragmentPageDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitPageDownloader(options, config, manipulator)) {} - } -} diff --git a/Beam.Stealth/StealthUnitDownloader.cs b/Beam.Stealth/StealthUnitDownloader.cs index 5a5ac27..b489c41 100644 --- a/Beam.Stealth/StealthUnitDownloader.cs +++ b/Beam.Stealth/StealthUnitDownloader.cs @@ -9,18 +9,27 @@ using System.Threading.Tasks; using Beam.Abstractions; using Beam.Downloaders; using Beam.Models; +using Beam.Stealth.Strategies; namespace Beam.Stealth { using File = System.IO.File; - public class StealthUnitDownloader : UnitDownloader where RawType : IDocument { + public class StealthUnitDownloader : UnitDownloader { public StealthConfig Config { get; } public StealthAsyncManipulator Manipulator { get; } private ILogger? Logger => Config.Logger; - public StealthUnitDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) { + private IDownloadStrategy _downloadStrategy { get; } + + public StealthUnitDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) { Config = config; Manipulator = manipulator; + + _downloadStrategy = options.Target switch { + DownloadTarget.URL or DownloadTarget.InURL => new PageDownloadStrategy(), + DownloadTarget.Complex => new WaitingDownloadStrategy(), + _ => throw new NotSupportedException() // TODO add an exception message + }; } protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, @@ -29,76 +38,7 @@ namespace Beam.Stealth { await driver.Navigate().GoToUrlAsync(url); await Manipulator(driver); - await using var stream = await WaitForDownloadAsync(url, progress, Stopwatch.StartNew(), ct); - await (stream?.CopyToAsync(destinationStream, ct) ?? Task.CompletedTask); - } - - /* --------------------------------------------------------------------- */ - - private async Task WaitForDownloadAsync( - string link, IProgress progress, Stopwatch sw, CancellationToken ct) { - const int PollDelayMs = 250; // how often we look - const int StableDelayMs = 1000; // size-unchanged window - - string dir = Config.DownloadsDirectory; - string? finalPath = null; - long lastSize = -1; - DateTime lastChange = DateTime.UtcNow; - - bool IsTemp(string p) => - p.EndsWith(".crdownload", StringComparison.OrdinalIgnoreCase) || - p.EndsWith(".part", StringComparison.OrdinalIgnoreCase); - - Logger?.LogDebug("Polling {Dir} for download files", dir); - - while (sw.Elapsed < Config.TimeOut && !ct.IsCancellationRequested) { - // current files in the directory - var files = Directory.EnumerateFiles(dir, "*", SearchOption.TopDirectoryOnly).ToArray(); - - // ignore temp names; pick (or re-pick) the first real candidate - finalPath ??= files.FirstOrDefault(f => !IsTemp(f)); - - // still nothing but temps – keep waiting - if (finalPath is null) { - await Task.Delay(PollDelayMs, ct); - continue; - } - - // track growth - long size = new FileInfo(finalPath).Length; - if (size == 0 || size != lastSize) { - progress?.Report(new DownloadReport() { - BytesDownloaded = size - lastSize, - }); - lastSize = size; - lastChange = DateTime.UtcNow; - await Task.Delay(PollDelayMs, ct); - continue; - } - - // size stable long enough *and* no temp files left? - bool tempsRemain = files.Any(IsTemp); - if ((DateTime.UtcNow - lastChange).TotalMilliseconds < StableDelayMs || tempsRemain) { - await Task.Delay(PollDelayMs, ct); - continue; - } - - // wait until writer releases lock - while (true) { - try { - using FileStream _ = - File.Open(finalPath, FileMode.Open, FileAccess.Read, FileShare.None); - break; - } catch (IOException) { - await Task.Delay(200, ct); - } - } - - return File.OpenRead(finalPath); - } - - Logger?.LogWarning("Download timed out after {Elapsed}", sw.Elapsed); - return null; + await _downloadStrategy.DownloadToStream(url, bufferSize, destinationStream, progress, Config, Logger, ct); } diff --git a/Beam.Stealth/StealthUnitPageDownloader.cs b/Beam.Stealth/StealthUnitPageDownloader.cs deleted file mode 100644 index 48d5e01..0000000 --- a/Beam.Stealth/StealthUnitPageDownloader.cs +++ /dev/null @@ -1,33 +0,0 @@ -using HtmlAgilityPack; -using Microsoft.Extensions.Logging; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Beam.Abstractions; -using Beam.Downloaders; -using Beam.Models; - -namespace Beam.Stealth { - public class StealthUnitPageDownloader : UnitDownloader where RawType : IDocument { - public StealthConfig Config { get; } - public StealthAsyncManipulator Manipulator { get; } - private ILogger? Logger => Config.Logger; - - public StealthUnitPageDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) { - Config = config; - Manipulator = manipulator; - } - - protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, CancellationToken ct) { - var driver = Config.Driver; - - await driver.Navigate().GoToUrlAsync(url); - await Manipulator(driver); - - byte[] bytes = Encoding.UTF8.GetBytes(driver.PageSource); - await destinationStream.WriteAsync(bytes, ct); - } - } -} diff --git a/Beam.Stealth/Strategies/IDownloadStrategy.cs b/Beam.Stealth/Strategies/IDownloadStrategy.cs new file mode 100644 index 0000000..e0b6ff7 --- /dev/null +++ b/Beam.Stealth/Strategies/IDownloadStrategy.cs @@ -0,0 +1,9 @@ +using Beam.Abstractions; +using Microsoft.Extensions.Logging; + +namespace Beam.Stealth.Strategies; + +internal interface IDownloadStrategy { + Task DownloadToStream(string url, int bufferSize, Stream destinationStream, + IProgress progress, StealthConfig config, ILogger? logger, CancellationToken ct); +} \ No newline at end of file diff --git a/Beam.Stealth/Strategies/PageDownloadStrategy.cs b/Beam.Stealth/Strategies/PageDownloadStrategy.cs new file mode 100644 index 0000000..b047d0b --- /dev/null +++ b/Beam.Stealth/Strategies/PageDownloadStrategy.cs @@ -0,0 +1,13 @@ +using System.Text; +using Beam.Abstractions; +using Microsoft.Extensions.Logging; + +namespace Beam.Stealth.Strategies; + +internal class PageDownloadStrategy : IDownloadStrategy { + public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, StealthConfig config, + ILogger? logger, CancellationToken ct) { + byte[] bytes = Encoding.UTF8.GetBytes(config.Driver.PageSource); + await destinationStream.WriteAsync(bytes, ct); + } +} \ No newline at end of file diff --git a/Beam.Stealth/Strategies/WaitingDownloadStrategy.cs b/Beam.Stealth/Strategies/WaitingDownloadStrategy.cs new file mode 100644 index 0000000..4c676ce --- /dev/null +++ b/Beam.Stealth/Strategies/WaitingDownloadStrategy.cs @@ -0,0 +1,83 @@ +using System.Diagnostics; +using Beam.Abstractions; +using Beam.Models; +using Microsoft.Extensions.Logging; +using File = System.IO.File; + +namespace Beam.Stealth.Strategies; + +public class WaitingDownloadStrategy : IDownloadStrategy { + public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, StealthConfig config, + ILogger? logger, CancellationToken ct) { + await using var stream = await WaitForDownloadAsync(url, progress, Stopwatch.StartNew(), config, logger, ct); + await (stream?.CopyToAsync(destinationStream, ct) ?? Task.CompletedTask); + } + + + private async Task WaitForDownloadAsync( + string link, IProgress progress, Stopwatch sw, StealthConfig config, ILogger? logger, CancellationToken ct) { + const int PollDelayMs = 250; // how often we look + const int StableDelayMs = 1000; // size-unchanged window + + string dir = config.DownloadsDirectory; + string? finalPath = null; + long lastSize = -1; + DateTime lastChange = DateTime.UtcNow; + + bool IsTemp(string p) => + p.EndsWith(".crdownload", StringComparison.OrdinalIgnoreCase) || + p.EndsWith(".part", StringComparison.OrdinalIgnoreCase); + + logger?.LogDebug("Polling {Dir} for download files", dir); + + while (sw.Elapsed < config.TimeOut && !ct.IsCancellationRequested) { + // current files in the directory + var files = Directory.EnumerateFiles(dir, "*", SearchOption.TopDirectoryOnly).ToArray(); + + // ignore temp names; pick (or re-pick) the first real candidate + finalPath ??= files.FirstOrDefault(f => !IsTemp(f)); + + // still nothing but temps – keep waiting + if (finalPath is null) { + await Task.Delay(PollDelayMs, ct); + continue; + } + + // track growth + long size = new FileInfo(finalPath).Length; + if (size == 0 || size != lastSize) { + progress?.Report(new DownloadReport() { + BytesDownloaded = size - lastSize, + }); + lastSize = size; + lastChange = DateTime.UtcNow; + await Task.Delay(PollDelayMs, ct); + continue; + } + + // size stable long enough *and* no temp files left? + bool tempsRemain = files.Any(IsTemp); + if ((DateTime.UtcNow - lastChange).TotalMilliseconds < StableDelayMs || tempsRemain) { + await Task.Delay(PollDelayMs, ct); + continue; + } + + // wait until writer releases lock + while (true) { + try { + using FileStream _ = + File.Open(finalPath, FileMode.Open, FileAccess.Read, FileShare.None); + break; + } catch (IOException) { + await Task.Delay(200, ct); + } + } + + return File.OpenRead(finalPath); + } + + logger?.LogWarning("Download timed out after {Elapsed}", sw.Elapsed); + return null; + } + +} \ No newline at end of file