From f52aa6123bd3de03f1002cedec157f880fb6b992 Mon Sep 17 00:00:00 2001
From: qwsdcvghyu89 <61093706+qwsdcvghyu89@users.noreply.github.com>
Date: Sat, 15 Nov 2025 22:51:46 +1100
Subject: [PATCH] Refactor downloaders to use ByteDocument and add options
builders
Replaces generic RawType with ByteDocument in downloaders and context classes, simplifying type usage. Adds builder classes for FailurePredicateOptions, FragmentOptions, SkipPredicateOptions, and UnitDownloaderOptions to improve configuration flexibility. Introduces DownloadTarget enum and SkipPredicate delegate for more granular download control. Refactors Fluent API interfaces and implementations to remove RawType generics and streamline usage. Adds Playwright and Stealth download strategies for extensibility.
---
Beam.Downloaders/Beam.Downloaders.csproj | 1 +
Beam.Downloaders/DownloadContext.cs | 6 +-
Beam.Downloaders/DownloadContextBuilder.cs | 34 +--
Beam.Downloaders/FailurePredicateOptions.cs | 9 +
.../FailurePredicateOptionsBuilder.cs | 56 +++++
Beam.Downloaders/FragmentOptions.cs | 7 +
Beam.Downloaders/FragmentOptionsBuilder.cs | 36 +++
Beam.Downloaders/SequentialDownloader.cs | 6 +-
.../SequentialFragmentDownloader.cs | 6 +-
Beam.Downloaders/SkipPredicateOptions.cs | 48 ++++
Beam.Downloaders/UnitDownloader.cs | 70 ++++--
Beam.Downloaders/UnitDownloaderOptions.cs | 210 +++---------------
.../UnitDownloaderOptionsBuilder.cs | 107 +++++++++
Beam.Downloaders/UnitFragmentDownloader.cs | 8 +-
Beam.Fluent/ContextStage.cs | 62 +++---
Beam.Fluent/Core/IContextStage.cs | 20 +-
Beam.Fluent/Core/IDownloadStage.cs | 8 +-
Beam.Fluent/Core/ITransformStage.cs | 4 +-
Beam.Fluent/DownloadStage.cs | 8 +-
Beam.Fluent/FluentDownload.cs | 14 +-
Beam.Fluent/TransformStage.cs | 6 +-
Beam.Models/DownloadTarget.cs | 28 +++
Beam.Models/SkipPredicate.cs | 5 +
Beam.Playwright/PlaywrightUnitDownloader.cs | 37 ++-
.../Strategies/IDownloadStrategy.cs | 9 +
.../Strategies/PageDownloadStrategy.cs | 15 ++
.../Strategies/WaitingDownloadStrategy.cs | 25 +++
Beam.Stealth/StealthFragmentDownloader.cs | 4 +-
Beam.Stealth/StealthFragmentPageDownloader.cs | 16 --
Beam.Stealth/StealthUnitDownloader.cs | 84 +------
Beam.Stealth/StealthUnitPageDownloader.cs | 33 ---
Beam.Stealth/Strategies/IDownloadStrategy.cs | 9 +
.../Strategies/PageDownloadStrategy.cs | 13 ++
.../Strategies/WaitingDownloadStrategy.cs | 83 +++++++
34 files changed, 648 insertions(+), 439 deletions(-)
create mode 100644 Beam.Downloaders/FailurePredicateOptions.cs
create mode 100644 Beam.Downloaders/FailurePredicateOptionsBuilder.cs
create mode 100644 Beam.Downloaders/FragmentOptions.cs
create mode 100644 Beam.Downloaders/FragmentOptionsBuilder.cs
create mode 100644 Beam.Downloaders/SkipPredicateOptions.cs
create mode 100644 Beam.Downloaders/UnitDownloaderOptionsBuilder.cs
create mode 100644 Beam.Models/DownloadTarget.cs
create mode 100644 Beam.Models/SkipPredicate.cs
create mode 100644 Beam.Playwright/Strategies/IDownloadStrategy.cs
create mode 100644 Beam.Playwright/Strategies/PageDownloadStrategy.cs
create mode 100644 Beam.Playwright/Strategies/WaitingDownloadStrategy.cs
delete mode 100644 Beam.Stealth/StealthFragmentPageDownloader.cs
delete mode 100644 Beam.Stealth/StealthUnitPageDownloader.cs
create mode 100644 Beam.Stealth/Strategies/IDownloadStrategy.cs
create mode 100644 Beam.Stealth/Strategies/PageDownloadStrategy.cs
create mode 100644 Beam.Stealth/Strategies/WaitingDownloadStrategy.cs
diff --git a/Beam.Downloaders/Beam.Downloaders.csproj b/Beam.Downloaders/Beam.Downloaders.csproj
index ab69f30..1526f93 100644
--- a/Beam.Downloaders/Beam.Downloaders.csproj
+++ b/Beam.Downloaders/Beam.Downloaders.csproj
@@ -14,6 +14,7 @@
+
diff --git a/Beam.Downloaders/DownloadContext.cs b/Beam.Downloaders/DownloadContext.cs
index 96009af..59e0ab1 100644
--- a/Beam.Downloaders/DownloadContext.cs
+++ b/Beam.Downloaders/DownloadContext.cs
@@ -8,14 +8,14 @@ namespace Beam.Downloaders {
//public delegate Task AsyncHtmlTransformer(HtmlDocument doc);
//public delegate Task AsyncBinaryTransformer(byte[] bin);
- public class DownloadContext {
+ public class DownloadContext {
private bool disposedValue;
public HttpClient Client { get; }
public HtmlWeb Web { get; }
public IProgress? DownloadReporter { get; set; }
public IProgress? RetryReporter { get; set; }
- public AsyncDownloadFailurePredicate?[]? AsyncFailurePredicates { get; }
+ public AsyncDownloadFailurePredicate?[]? AsyncFailurePredicates { get; }
public TimeSpan TimeOut { get; set; }
public IEnumerable Links { get; }
public CancellationToken CancellationToken { get; }
@@ -28,7 +28,7 @@ namespace Beam.Downloaders {
CancellationToken cancellationToken = default,
IProgress? downloadReporter = null,
IProgress? retryReporter = null,
- AsyncDownloadFailurePredicate?[]? asyncFailurePredicates = null,
+ AsyncDownloadFailurePredicate?[]? asyncFailurePredicates = null,
TimeSpan? timeOut = null,
ILogger? downloadLogger = null) {
ArgumentNullException.ThrowIfNull(web, nameof(web));
diff --git a/Beam.Downloaders/DownloadContextBuilder.cs b/Beam.Downloaders/DownloadContextBuilder.cs
index d92277a..41f5e13 100644
--- a/Beam.Downloaders/DownloadContextBuilder.cs
+++ b/Beam.Downloaders/DownloadContextBuilder.cs
@@ -5,12 +5,12 @@ using Microsoft.Extensions.Logging;
namespace Beam.Downloaders {
- public class DownloadContextBuilder {
+ public class DownloadContextBuilder {
private HtmlWeb _web;
private HttpClient _client;
private IProgress? _downloadReporter;
private IProgress? _retryReporter;
- private AsyncDownloadFailurePredicate?[] _asyncFailurePredicates = [];
+ private AsyncDownloadFailurePredicate?[] _asyncFailurePredicates = [];
private TimeSpan _timeOut;
private IEnumerable _links;
private CancellationToken _cancellationToken;
@@ -26,60 +26,60 @@ namespace Beam.Downloaders {
_links = [];
}
- public DownloadContextBuilder WithWeb(HtmlWeb web) {
+ public DownloadContextBuilder WithWeb(HtmlWeb web) {
_web = web;
return this;
}
- public DownloadContextBuilder WithClient(HttpClient client) {
+ public DownloadContextBuilder WithClient(HttpClient client) {
_client = client;
return this;
}
- public DownloadContextBuilder WithDownloadReporter(IProgress downloadReporter) {
+ public DownloadContextBuilder WithDownloadReporter(IProgress downloadReporter) {
_downloadReporter = downloadReporter;
return this;
}
- public DownloadContextBuilder WithRetryReporter(IProgress retryReporter) {
+ public DownloadContextBuilder WithRetryReporter(IProgress retryReporter) {
_retryReporter = retryReporter;
return this;
}
- public DownloadContextBuilder WithAsyncFailurePredicates(params AsyncDownloadFailurePredicate[] predicates) {
+ public DownloadContextBuilder WithAsyncFailurePredicates(params AsyncDownloadFailurePredicate[] predicates) {
_asyncFailurePredicates = predicates;
return this;
}
- public DownloadContextBuilder WithTimeOut(TimeSpan timeOut) {
+ public DownloadContextBuilder WithTimeOut(TimeSpan timeOut) {
_timeOut = timeOut;
return this;
}
- public DownloadContextBuilder WithLinks(IEnumerable links) {
+ public DownloadContextBuilder WithLinks(IEnumerable links) {
_links = links;
return this;
}
- public DownloadContextBuilder WithCancellationToken(CancellationToken cancellationToken) {
+ public DownloadContextBuilder WithCancellationToken(CancellationToken cancellationToken) {
_cancellationToken = cancellationToken;
return this;
}
- public DownloadContextBuilder WithCache(DocumentCache cache) {
+ public DownloadContextBuilder WithCache(DocumentCache cache) {
_cache = cache;
return this;
}
- public DownloadContextBuilder WithDownloadLogger(ILogger downloadLogger) {
+ public DownloadContextBuilder WithDownloadLogger(ILogger downloadLogger) {
_downloadLogger = downloadLogger;
return this;
}
- public DownloadContext Build() {
+ public DownloadContext Build() {
// Construct the DownloadContext using the collected values.
- var context = new DownloadContext(
+ var context = new DownloadContext(
web: _web,
client: _client,
links: _links,
@@ -100,15 +100,15 @@ namespace Beam.Downloaders {
return context;
}
- public static DownloadContextBuilder FromContext(DownloadContext existing) {
+ public static DownloadContextBuilder FromContext(DownloadContext existing) {
if (existing == null) throw new ArgumentNullException(nameof(existing));
- return new DownloadContextBuilder(existing.Client, existing.Web)
+ return new DownloadContextBuilder(existing.Client, existing.Web)
.WithLinks(existing.Links)
.WithCancellationToken(existing.CancellationToken)
.WithDownloadReporter(existing.DownloadReporter!)
.WithRetryReporter(existing.RetryReporter!)
- .WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty>())
+ .WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty>())
.WithTimeOut(existing.TimeOut)
.WithDownloadLogger(existing.DownloadLogger!)
.WithCache(existing.Cache);
diff --git a/Beam.Downloaders/FailurePredicateOptions.cs b/Beam.Downloaders/FailurePredicateOptions.cs
new file mode 100644
index 0000000..e804577
--- /dev/null
+++ b/Beam.Downloaders/FailurePredicateOptions.cs
@@ -0,0 +1,9 @@
+using Beam.Models;
+
+namespace Beam.Downloaders;
+
+/// <summary>
+/// Immutable settings describing which failure predicates are checked and
+/// whether that check may run in parallel.
+/// </summary>
+public record FailurePredicateOptions {
+    /// <summary>Predicates to evaluate; the array itself and individual entries may be null.</summary>
+    public required AsyncDownloadFailurePredicate?[]? AsyncDownloadFailurePredicates { get; init; }
+
+    /// <summary>True to evaluate the predicates in parallel; off unless set.</summary>
+    public bool ProcessInParallel { get; init; }
+
+    /// <summary>Optional thread count for parallel evaluation; null when unspecified.</summary>
+    public int? ParallelThreads { get; init; }
+}
\ No newline at end of file
diff --git a/Beam.Downloaders/FailurePredicateOptionsBuilder.cs b/Beam.Downloaders/FailurePredicateOptionsBuilder.cs
new file mode 100644
index 0000000..d5ec367
--- /dev/null
+++ b/Beam.Downloaders/FailurePredicateOptionsBuilder.cs
@@ -0,0 +1,56 @@
+using Beam.Models;
+
+namespace Beam.Downloaders;
+
+/// <summary>
+/// Fluent builder that collects failure predicates and parallelism settings
+/// and turns them into a <c>FailurePredicateOptions</c> instance.
+/// </summary>
+public sealed class FailurePredicateOptionsBuilder
+{
+    private readonly System.Collections.Generic.List?> _predicates = new();
+    private bool _processInParallel;
+    private int? _parallelThreads;
+
+    /// <summary>Appends a single predicate; a null entry is stored as-is.</summary>
+    public FailurePredicateOptionsBuilder WithPredicate(AsyncDownloadFailurePredicate? predicate)
+    {
+        _predicates.Add(predicate);
+        return this;
+    }
+
+    /// <summary>Appends every predicate of the sequence to those already collected.</summary>
+    /// <exception cref="System.ArgumentNullException">The sequence itself is null.</exception>
+    public FailurePredicateOptionsBuilder WithPredicates(System.Collections.Generic.IEnumerable?> predicates)
+    {
+        if (predicates is null)
+        {
+            throw new System.ArgumentNullException(nameof(predicates));
+        }
+
+        _predicates.AddRange(predicates);
+        return this;
+    }
+
+    /// <summary>
+    /// Replaces the collected predicates with the given ones. Unlike the
+    /// sequence overload this one clears first; a null array only clears.
+    /// </summary>
+    public FailurePredicateOptionsBuilder WithPredicates(params AsyncDownloadFailurePredicate?[] predicates)
+    {
+        _predicates.Clear();
+        if (predicates is not null)
+        {
+            _predicates.AddRange(predicates);
+        }
+
+        return this;
+    }
+
+    /// <summary>Switches parallel predicate evaluation on (default argument) or off.</summary>
+    public FailurePredicateOptionsBuilder WithProcessInParallel(bool value = true)
+    {
+        _processInParallel = value;
+        return this;
+    }
+
+    /// <summary>Sets the optional thread count; null clears it.</summary>
+    /// <exception cref="System.ArgumentOutOfRangeException">A value of zero or less was supplied.</exception>
+    public FailurePredicateOptionsBuilder WithParallelThreads(int? threads)
+    {
+        if (threads is <= 0)
+        {
+            throw new System.ArgumentOutOfRangeException(nameof(threads));
+        }
+
+        _parallelThreads = threads;
+        return this;
+    }
+
+    /// <summary>Creates the options object from the current builder state.</summary>
+    public FailurePredicateOptions Build()
+    {
+        // ToArray() on an empty list already yields an empty array.
+        return new FailurePredicateOptions
+        {
+            AsyncDownloadFailurePredicates = _predicates.ToArray(),
+            ProcessInParallel = _processInParallel,
+            ParallelThreads = _parallelThreads
+        };
+    }
+}
\ No newline at end of file
diff --git a/Beam.Downloaders/FragmentOptions.cs b/Beam.Downloaders/FragmentOptions.cs
new file mode 100644
index 0000000..a00c698
--- /dev/null
+++ b/Beam.Downloaders/FragmentOptions.cs
@@ -0,0 +1,7 @@
+namespace Beam.Downloaders;
+
+/// <summary>
+/// Immutable settings for fragmented downloads.
+/// </summary>
+public record FragmentOptions {
+    /// <summary>Size of one fragment; must be supplied by the creator.</summary>
+    public required int FragmentSize { get; init; }
+
+    /// <summary>True to download fragments in parallel; off unless set.</summary>
+    public bool DownloadInParallel { get; init; }
+
+    /// <summary>Optional thread count for parallel downloads; null when unspecified.</summary>
+    public int? ParallelThreads { get; init; }
+}
\ No newline at end of file
diff --git a/Beam.Downloaders/FragmentOptionsBuilder.cs b/Beam.Downloaders/FragmentOptionsBuilder.cs
new file mode 100644
index 0000000..49ea521
--- /dev/null
+++ b/Beam.Downloaders/FragmentOptionsBuilder.cs
@@ -0,0 +1,36 @@
+namespace Beam.Downloaders;
+
+/// <summary>
+/// Fluent builder for <see cref="FragmentOptions"/>.
+/// </summary>
+public sealed class FragmentOptionsBuilder {
+    private int? _fragmentSize;
+    private bool _downloadInParallel;
+    private int? _parallelThreads;
+
+    /// <summary>Sets the mandatory fragment size.</summary>
+    /// <exception cref="System.ArgumentOutOfRangeException"><paramref name="bytes"/> is zero or negative.</exception>
+    public FragmentOptionsBuilder WithFragmentSize(int bytes) {
+        _fragmentSize = bytes > 0
+            ? bytes
+            : throw new System.ArgumentOutOfRangeException(nameof(bytes));
+        return this;
+    }
+
+    /// <summary>Switches parallel fragment download on (default argument) or off.</summary>
+    public FragmentOptionsBuilder WithDownloadInParallel(bool value = true) {
+        _downloadInParallel = value;
+        return this;
+    }
+
+    /// <summary>Sets the optional thread count; null clears it.</summary>
+    /// <exception cref="System.ArgumentOutOfRangeException">A value of zero or less was supplied.</exception>
+    public FragmentOptionsBuilder WithParallelThreads(int? threads) {
+        if (threads is <= 0) {
+            throw new System.ArgumentOutOfRangeException(nameof(threads));
+        }
+
+        _parallelThreads = threads;
+        return this;
+    }
+
+    /// <summary>Creates the options object from the current builder state.</summary>
+    /// <exception cref="System.InvalidOperationException">No fragment size was configured.</exception>
+    public FragmentOptions Build() {
+        if (_fragmentSize is not int size) {
+            throw new System.InvalidOperationException("FragmentSize must be provided.");
+        }
+
+        return new FragmentOptions {
+            FragmentSize = size,
+            DownloadInParallel = _downloadInParallel,
+            ParallelThreads = _parallelThreads
+        };
+    }
+}
\ No newline at end of file
diff --git a/Beam.Downloaders/SequentialDownloader.cs b/Beam.Downloaders/SequentialDownloader.cs
index 378000b..2f7db22 100644
--- a/Beam.Downloaders/SequentialDownloader.cs
+++ b/Beam.Downloaders/SequentialDownloader.cs
@@ -3,9 +3,9 @@ using Beam.Models;
using Microsoft.Extensions.Logging;
namespace Beam.Downloaders {
- public class SequentialDownloader : IAsyncEnumerator {
+ public class SequentialDownloader : IAsyncEnumerator {
public OutType Current { get; protected set; }
- public DownloadContext Context { get; }
+ public DownloadContext Context { get; }
public ILogger? Logger { get; set; }
public int LastOrder { get; set; } = 0;
@@ -13,7 +13,7 @@ namespace Beam.Downloaders {
public Func> GetUnitDownloader { get; set; }
- public SequentialDownloader(DownloadContext context, Func, IUnitDownloader> getUnitDownloader, ILogger? logger = null) {
+ public SequentialDownloader(DownloadContext context, Func> getUnitDownloader, ILogger? logger = null) {
Context = context;
Logger = logger;
LinksEnumerator = Context.Links.GetEnumerator();
diff --git a/Beam.Downloaders/SequentialFragmentDownloader.cs b/Beam.Downloaders/SequentialFragmentDownloader.cs
index b4c40d5..a7a1274 100644
--- a/Beam.Downloaders/SequentialFragmentDownloader.cs
+++ b/Beam.Downloaders/SequentialFragmentDownloader.cs
@@ -3,10 +3,10 @@ using Beam.Models;
using Microsoft.Extensions.Logging;
namespace Beam.Downloaders {
- public class SequentialFragmentDownloader : SequentialDownloader>> {
+ public class SequentialFragmentDownloader : SequentialDownloader>> {
public SequentialFragmentDownloader(
- DownloadContext context,
- Func, IUnitDownloader>>> getUnitDownloader,
+ DownloadContext context,
+ Func>>> getUnitDownloader,
ILogger? logger = null)
: base(context, getUnitDownloader, logger) {}
}
diff --git a/Beam.Downloaders/SkipPredicateOptions.cs b/Beam.Downloaders/SkipPredicateOptions.cs
new file mode 100644
index 0000000..2955cf7
--- /dev/null
+++ b/Beam.Downloaders/SkipPredicateOptions.cs
@@ -0,0 +1,48 @@
+using Beam.Models;
+
+namespace Beam.Downloaders;
+
+/// <summary>
+/// Settings describing which skip predicates are consulted before a download
+/// and whether they may be evaluated in parallel.
+/// </summary>
+public class SkipPredicateOptions {
+    /// <summary>Predicates to consult; the array itself and individual entries may be null.</summary>
+    public required SkipPredicate?[]? SkipPredicates { get; init; }
+
+    /// <summary>True to evaluate the predicates in parallel; off unless set.</summary>
+    public bool ProcessInParallel { get; init; }
+
+    /// <summary>Optional thread count for parallel evaluation; null when unspecified.</summary>
+    public int? ParallelThreads { get; init; }
+}
+
+/// <summary>
+/// Fluent builder that collects skip predicates and parallelism settings and
+/// turns them into a <c>SkipPredicateOptions</c> instance.
+/// </summary>
+public class SkipPredicateOptionsBuilder {
+    // Plain fields: this is private builder state, not part of any contract.
+    private readonly List?> _skipPredicates = [];
+    private bool _processInParallel;
+    private int? _parallelThreads;
+
+    /// <summary>Adds one predicate, optionally discarding those collected so far.</summary>
+    /// <param name="predicate">Predicate to add.</param>
+    /// <param name="replace">True to clear the collected predicates first; defaults to false (append).</param>
+    public SkipPredicateOptionsBuilder WithSkipPredicate(SkipPredicate predicate, bool replace=false) {
+        if (replace)
+            _skipPredicates.Clear();
+        _skipPredicates.Add(predicate);
+        return this;
+    }
+
+    /// <summary>Adds several predicates, by default replacing those collected so far.</summary>
+    /// <param name="predicates">Predicates to add.</param>
+    /// <param name="replace">True (the default) to clear the collected predicates first.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="predicates"/> is null.</exception>
+    public SkipPredicateOptionsBuilder WithSkipPredicates(SkipPredicate[] predicates,
+        bool replace = true) {
+        // Validate before clearing so a bad call cannot wipe the existing state.
+        ArgumentNullException.ThrowIfNull(predicates);
+        if (replace)
+            _skipPredicates.Clear();
+        _skipPredicates.AddRange(predicates);
+        return this;
+    }
+
+    /// <summary>Switches parallel predicate evaluation on (default argument) or off.</summary>
+    public SkipPredicateOptionsBuilder ProcessInParallel(bool processInParallel = true) {
+        _processInParallel = processInParallel;
+        return this;
+    }
+
+    /// <summary>Sets the thread count used for parallel evaluation.</summary>
+    /// <exception cref="ArgumentOutOfRangeException"><paramref name="parallelThreads"/> is zero or negative.</exception>
+    public SkipPredicateOptionsBuilder WithParallelThreads(int parallelThreads) {
+        // Same range check as the failure-predicate and fragment builders.
+        if (parallelThreads <= 0)
+            throw new ArgumentOutOfRangeException(nameof(parallelThreads));
+        _parallelThreads = parallelThreads;
+        return this;
+    }
+
+    /// <summary>Creates the options object from the current builder state.</summary>
+    public SkipPredicateOptions Build() {
+        return new SkipPredicateOptions() {
+            SkipPredicates = _skipPredicates.ToArray(),
+            ParallelThreads = _parallelThreads,
+            ProcessInParallel = _processInParallel
+        };
+    }
+}
\ No newline at end of file
diff --git a/Beam.Downloaders/UnitDownloader.cs b/Beam.Downloaders/UnitDownloader.cs
index 400f47a..cad1ccc 100644
--- a/Beam.Downloaders/UnitDownloader.cs
+++ b/Beam.Downloaders/UnitDownloader.cs
@@ -1,4 +1,6 @@
-using Beam.Abstractions;
+using System.Diagnostics.CodeAnalysis;
+using System.Text;
+using Beam.Abstractions;
using Beam.Models;
using HtmlAgilityPack;
using File = System.IO.File;
@@ -11,12 +13,12 @@ namespace Beam.Downloaders {
///
///
///
- public class UnitDownloader(UnitDownloaderOptions options) : IUnitDownloader where RawType : IDocument {
- public UnitDownloaderOptions Options { get; } = options;
+ public class UnitDownloader(UnitDownloaderOptions options) : IUnitDownloader {
+ public UnitDownloaderOptions Options { get; } = options;
public HttpClient Client => Options.Client;
- public virtual AsyncTransformer Transformer => Options.AsyncTransformer;
-
- public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates =>
+ public virtual AsyncTransformer Transformer => Options.AsyncTransformer;
+
+ public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates =>
Options?.FailurePredicateOptions?.AsyncDownloadFailurePredicates;
public int LinksPerDownload { get; } = 1;
@@ -70,7 +72,7 @@ namespace Beam.Downloaders {
return new ByteDocument(url, bytes);
}
- protected virtual async Task IsFailure(RawType doc, CancellationToken ct) {
+ protected virtual async Task IsFailure(ByteDocument doc, CancellationToken ct) {
if (FailurePredicates is null)
return false;
if (!(Options?.FailurePredicateOptions?.ProcessInParallel ?? false))
@@ -103,19 +105,18 @@ namespace Beam.Downloaders {
return false;
}
- protected virtual async Task _Download(string link, IProgress progress, CancellationToken ct) {
- if (Options.DownloadFolder is not null && this is UnitDownloader) {
- var path = Path.Combine(Options.DownloadFolder, Path.GetRandomFileName());
+ protected virtual async Task _Download(string link, IProgress progress, CancellationToken ct) {
+ if (Options.DownloadFolder is not null) {
+ var path = Path.Combine(Options.DownloadFolder, options.GetFileNameForDownload(link, []));
await DownloadToFile(link, Options.BufferSize, path, progress, ct);
- return (RawType)(object)new StringDocument(link, path);
+ return new ByteDocument(link, Encoding.UTF8.GetBytes(path));
}
- if (this is UnitDownloader) {
- return (RawType)(object)(await DownloadToMemory(link, Options.BufferSize, progress, ct));
+ else {
+ return await DownloadToMemory(link, Options.BufferSize, progress, ct);
}
- throw new NotSupportedException(Exceptions.Exceptions.unit_downloader_limited_support);
}
- protected virtual async Task<(bool, OutType?)> Transform(RawType download, CancellationToken ct) {
+ protected virtual async Task<(bool, OutType?)> Transform(ByteDocument download, CancellationToken ct) {
try {
if (FailurePredicates is null || !(await IsFailure(download, ct)))
return (true, await Transformer(download));
@@ -131,6 +132,9 @@ namespace Beam.Downloaders {
return (false, default);
downProgress ??= new Progress();
+
+ if (ShouldSkip(link[0].Data, out var defaultType))
+ return (true, defaultType);
OutType? ot = default;
int tryCount = 0;
@@ -147,5 +151,41 @@ namespace Beam.Downloaders {
return (false, ot);
}
+
+        /// <summary>
+        /// Consults the configured skip predicates for <paramref name="link"/>.
+        /// </summary>
+        /// <param name="link">Link text handed to every predicate.</param>
+        /// <param name="outType">Value written by a predicate that requested the skip.</param>
+        /// <returns>
+        /// True when a predicate returned true; false when none did or when no
+        /// skip predicates are configured.
+        /// </returns>
+        private bool ShouldSkip(string link, [NotNullWhen(true)] out OutType? outType) {
+            outType = default;
+            var skipOptions = Options.SkipPredicateOptions;
+            if (skipOptions?.SkipPredicates is null)
+                return false;
+
+            if (!skipOptions.ProcessInParallel) {
+                // Sequential mode: the first predicate that returns true wins.
+                foreach (var pred in skipOptions.SkipPredicates) {
+                    if (pred is null)
+                        continue;
+                    if (pred(link, out outType))
+                        return true;
+                }
+                return false;
+            }
+
+            var shouldSkip = false;
+            OutType? _outType = default;
+            Parallel.ForEach(skipOptions.SkipPredicates, new ParallelOptions() {
+                    // The thread cap comes from the skip-predicate options themselves
+                    // (not the failure-predicate options); 4 when unspecified.
+                    MaxDegreeOfParallelism = skipOptions.ParallelThreads ?? 4
+                },
+                (predicate, parallelLoopState) => {
+                    if (parallelLoopState.ShouldExitCurrentIteration)
+                        return;
+                    if (predicate == null)
+                        return;
+                    if (predicate(link, out var _innerLoopOutType)) {
+                        // Only the first writer sets the flag and the result value.
+                        Interlocked.CompareExchange(ref shouldSkip, true, false);
+                        Interlocked.CompareExchange(ref _outType, _innerLoopOutType, default);
+                        parallelLoopState.Break();
+                    }
+                }
+            );
+            outType = _outType;
+            return shouldSkip;
+        }
}
}
diff --git a/Beam.Downloaders/UnitDownloaderOptions.cs b/Beam.Downloaders/UnitDownloaderOptions.cs
index dc99555..421c3b6 100644
--- a/Beam.Downloaders/UnitDownloaderOptions.cs
+++ b/Beam.Downloaders/UnitDownloaderOptions.cs
@@ -1,198 +1,38 @@
+using System.Diagnostics.CodeAnalysis;
+using System.Security.Cryptography;
+using System.Text;
using Beam.Models;
namespace Beam.Downloaders;
-public record class UnitDownloaderOptions {
+public record class UnitDownloaderOptions {
public HttpClient Client { get; init; } = new();
+
+ public DownloadTarget Target { get; init; } = DownloadTarget.URL;
- public FailurePredicateOptions? FailurePredicateOptions { get; init; }
+ public SkipPredicateOptions? SkipPredicateOptions { get; init; }
+ public FailurePredicateOptions? FailurePredicateOptions { get; init; }
public FragmentOptions? FragmentOptions { get; init; }
- public required AsyncTransformer AsyncTransformer { get; init; }
+ public required AsyncTransformer AsyncTransformer { get; init; }
+
+ ///
+ /// The location where the download is stored.
+ ///
+ ///
+ /// If not defined, UnitDownloader.TryDownload() downloads to memory.
+ ///
public string? DownloadFolder { get; init; } = null;
public int BufferSize { get; init; } = 80 * 1024; // 80kb
+
+    /// <summary>
+    /// Derives a file name from the XxHash64 of the UTF-8 bytes of
+    /// <paramref name="url"/> followed by <paramref name="additionalData"/>.
+    /// </summary>
+    /// <param name="url">URL whose UTF-8 bytes form the first part of the hashed input.</param>
+    /// <param name="additionalData">Extra bytes appended to the hashed input.</param>
+    /// <returns>
+    /// The hash as Base64 with '+' replaced by '-', '/' by '_', and '='
+    /// turned into spaces that are then trimmed away.
+    /// </returns>
+    public string GetFileNameForDownload(string url, byte[] additionalData) {
+        var urlBytes = Encoding.UTF8.GetBytes(url);
+        var payload = new byte[urlBytes.Length + additionalData.Length];
+        urlBytes.CopyTo(payload, 0);
+        additionalData.CopyTo(payload, urlBytes.Length);
+
+        var digest = System.IO.Hashing.XxHash64.Hash(payload);
+        return Convert.ToBase64String(digest)
+            .Replace('+', '-')
+            .Replace('/', '_')
+            .Replace('=', ' ')
+            .Trim();
+    }
}
-public record class FailurePredicateOptions {
- public required AsyncDownloadFailurePredicate?[]? AsyncDownloadFailurePredicates { get; init; }
- public bool ProcessInParallel { get; init; } = false;
- public int? ParallelThreads { get; init; }
-}
+// ---------- UnitDownloaderOptions Builder ----------
-public record class FragmentOptions {
- public required int FragmentSize { get; init; }
- public bool DownloadInParallel { get; init; } = false;
- public int? ParallelThreads { get; init; }
-}
+// ---------- FailurePredicateOptions Builder ----------
-
- // ---------- UnitDownloaderOptions Builder ----------
- public sealed class UnitDownloaderOptionsBuilder
- {
- private HttpClient _client = new HttpClient();
- private FailurePredicateOptions? _failureOptions;
- private FragmentOptions? _fragmentOptions;
- private AsyncTransformer? _asyncTransformer;
- private string? _downloadFolder = null;
- private int _bufferSize = 80 * 1024;
-
- public UnitDownloaderOptionsBuilder WithClient(HttpClient client)
- {
- _client = client ?? throw new System.ArgumentNullException(nameof(client));
- return this;
- }
-
- public UnitDownloaderOptionsBuilder WithFailurePredicateOptions(FailurePredicateOptions? options)
- {
- _failureOptions = options;
- return this;
- }
-
- public UnitDownloaderOptionsBuilder WithFailurePredicates(System.Action> configure)
- {
- if (configure == null) throw new System.ArgumentNullException(nameof(configure));
- var b = new FailurePredicateOptionsBuilder();
- configure(b);
- _failureOptions = b.Build();
- return this;
- }
-
- public UnitDownloaderOptionsBuilder WithFragmentOptions(FragmentOptions? options)
- {
- _fragmentOptions = options;
- return this;
- }
-
- public UnitDownloaderOptionsBuilder WithFragments(System.Action configure)
- {
- if (configure == null) throw new System.ArgumentNullException(nameof(configure));
- var b = new FragmentOptionsBuilder();
- configure(b);
- _fragmentOptions = b.Build();
- return this;
- }
-
- public UnitDownloaderOptionsBuilder WithAsyncTransformer(AsyncTransformer transformer)
- {
- _asyncTransformer = transformer ?? throw new System.ArgumentNullException(nameof(transformer));
- return this;
- }
-
- public UnitDownloaderOptionsBuilder WithDownloadFolder(string? downloadFolder)
- {
- _downloadFolder = downloadFolder;
- return this;
- }
-
- public UnitDownloaderOptionsBuilder WithBufferSize(int bytes)
- {
- if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes));
- _bufferSize = bytes;
- return this;
- }
-
- public UnitDownloaderOptions Build()
- {
- if (_asyncTransformer == null)
- throw new System.InvalidOperationException("AsyncTransformer must be provided.");
-
- return new UnitDownloaderOptions
- {
- Client = _client,
- FailurePredicateOptions = _failureOptions,
- FragmentOptions = _fragmentOptions,
- AsyncTransformer = _asyncTransformer,
- DownloadFolder = _downloadFolder,
- BufferSize = _bufferSize
- };
- }
- }
-
- // ---------- FailurePredicateOptions Builder ----------
- public sealed class FailurePredicateOptionsBuilder
- {
- private readonly System.Collections.Generic.List?> _predicates =
- new System.Collections.Generic.List?>();
- private bool _processInParallel = false;
- private int? _parallelThreads = null;
-
- public FailurePredicateOptionsBuilder WithPredicate(AsyncDownloadFailurePredicate? predicate)
- {
- _predicates.Add(predicate);
- return this;
- }
-
- public FailurePredicateOptionsBuilder WithPredicates(System.Collections.Generic.IEnumerable?> predicates)
- {
- if (predicates == null) throw new System.ArgumentNullException(nameof(predicates));
- _predicates.AddRange(predicates);
- return this;
- }
-
- public FailurePredicateOptionsBuilder WithPredicates(params AsyncDownloadFailurePredicate?[] predicates)
- {
- _predicates.Clear();
- if (predicates != null) _predicates.AddRange(predicates);
- return this;
- }
-
- public FailurePredicateOptionsBuilder WithProcessInParallel(bool value = true)
- {
- _processInParallel = value;
- return this;
- }
-
- public FailurePredicateOptionsBuilder WithParallelThreads(int? threads)
- {
- if (threads.HasValue && threads.Value <= 0)
- throw new System.ArgumentOutOfRangeException(nameof(threads));
- _parallelThreads = threads;
- return this;
- }
-
- public FailurePredicateOptions Build()
- {
- var arr = _predicates.Count == 0 ? [] : _predicates.ToArray();
- return new FailurePredicateOptions
- {
- AsyncDownloadFailurePredicates = arr,
- ProcessInParallel = _processInParallel,
- ParallelThreads = _parallelThreads
- };
- }
- }
-
- // ---------- FragmentOptions Builder ----------
- public sealed class FragmentOptionsBuilder {
- private int? _fragmentSize;
- private bool _downloadInParallel = false;
- private int? _parallelThreads = null;
-
- public FragmentOptionsBuilder WithFragmentSize(int bytes) {
- if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes));
- _fragmentSize = bytes;
- return this;
- }
-
- public FragmentOptionsBuilder WithDownloadInParallel(bool value = true) {
- _downloadInParallel = value;
- return this;
- }
-
- public FragmentOptionsBuilder WithParallelThreads(int? threads) {
- if (threads.HasValue && threads.Value <= 0)
- throw new System.ArgumentOutOfRangeException(nameof(threads));
- _parallelThreads = threads;
- return this;
- }
-
- public FragmentOptions Build() {
- if (!_fragmentSize.HasValue)
- throw new System.InvalidOperationException("FragmentSize must be provided.");
-
- return new FragmentOptions {
- FragmentSize = _fragmentSize.Value,
- DownloadInParallel = _downloadInParallel,
- ParallelThreads = _parallelThreads
- };
- }
- }
+// ---------- FragmentOptions Builder ----------
\ No newline at end of file
diff --git a/Beam.Downloaders/UnitDownloaderOptionsBuilder.cs b/Beam.Downloaders/UnitDownloaderOptionsBuilder.cs
new file mode 100644
index 0000000..4675092
--- /dev/null
+++ b/Beam.Downloaders/UnitDownloaderOptionsBuilder.cs
@@ -0,0 +1,107 @@
+using Beam.Models;
+
+namespace Beam.Downloaders;
+
+public sealed class UnitDownloaderOptionsBuilder {
+ private DownloadTarget _target = DownloadTarget.URL;
+ private HttpClient _client = new HttpClient();
+ private FailurePredicateOptionsBuilder _failureOptionsBuilder = new();
+ private FailurePredicateOptions? _failurePredicateOptionsOverride = null;
+ private SkipPredicateOptionsBuilder _skipPredicateOptionsBuilder = new();
+ private SkipPredicateOptions? _skipPredicateOptionsOverride = null;
+ private FragmentOptions? _fragmentOptions;
+ private AsyncTransformer? _asyncTransformer;
+ private string? _downloadFolder = null;
+ private int _bufferSize = 80 * 1024;
+
+ public UnitDownloaderOptionsBuilder WithTarget(DownloadTarget target) {
+ _target = target;
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithClient(HttpClient client)
+ {
+ _client = client ?? throw new System.ArgumentNullException(nameof(client));
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithFailurePredicateOptions(FailurePredicateOptions? options)
+ {
+ _failurePredicateOptionsOverride = options;
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithFailurePredicates(System.Action> configure)
+ {
+ if (configure == null) throw new System.ArgumentNullException(nameof(configure));
+ configure(_failureOptionsBuilder);
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithFragmentOptions(FragmentOptions? options)
+ {
+ _fragmentOptions = options;
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithSkipPredicates(Action> configure) {
+ if (configure == null) throw new ArgumentNullException(nameof(configure));
+ configure(_skipPredicateOptionsBuilder);
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithSkipPredicateOptions(
+ SkipPredicateOptions skipPredicateOptions) {
+ _skipPredicateOptionsOverride = skipPredicateOptions;
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithFragments(System.Action configure)
+ {
+ if (configure == null) throw new System.ArgumentNullException(nameof(configure));
+ var b = new FragmentOptionsBuilder();
+ configure(b);
+ _fragmentOptions = b.Build();
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithAsyncTransformer(AsyncTransformer transformer)
+ {
+ _asyncTransformer = transformer ?? throw new System.ArgumentNullException(nameof(transformer));
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithDownloadFolder(string? downloadFolder)
+ {
+ _downloadFolder = downloadFolder;
+ return this;
+ }
+
+ public UnitDownloaderOptionsBuilder WithBufferSize(int bytes)
+ {
+ if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes));
+ _bufferSize = bytes;
+ return this;
+ }
+
+    /// <summary>
+    /// Creates the options object from the current builder state. An explicit
+    /// options override takes precedence over the matching nested builder.
+    /// </summary>
+    /// <exception cref="System.InvalidOperationException">No transformer was configured.</exception>
+    public UnitDownloaderOptions Build()
+    {
+        if (_asyncTransformer == null)
+            throw new System.InvalidOperationException("AsyncTransformer must be provided.");
+
+        // Resolve into locals instead of writing back to the override fields:
+        // caching the built value there would make every later Build() ignore
+        // changes made through WithFailurePredicates / WithSkipPredicates.
+        var failureOptions = _failurePredicateOptionsOverride ?? _failureOptionsBuilder.Build();
+        var skipOptions = _skipPredicateOptionsOverride ?? _skipPredicateOptionsBuilder.Build();
+
+        return new UnitDownloaderOptions
+        {
+            Target = _target,
+            Client = _client,
+            FailurePredicateOptions = failureOptions,
+            SkipPredicateOptions = skipOptions,
+            FragmentOptions = _fragmentOptions,
+            AsyncTransformer = _asyncTransformer,
+            DownloadFolder = _downloadFolder,
+            BufferSize = _bufferSize
+        };
+    }
+}
\ No newline at end of file
diff --git a/Beam.Downloaders/UnitFragmentDownloader.cs b/Beam.Downloaders/UnitFragmentDownloader.cs
index a399128..017fce4 100644
--- a/Beam.Downloaders/UnitFragmentDownloader.cs
+++ b/Beam.Downloaders/UnitFragmentDownloader.cs
@@ -5,12 +5,12 @@ using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
namespace Beam.Downloaders {
- public class UnitFragmentDownloader(UnitDownloaderOptions options,
- IUnitDownloader? internalDownloader = null) : IUnitDownloader>> where RawType : IDocument {
+ public class UnitFragmentDownloader(UnitDownloaderOptions options,
+ IUnitDownloader? internalDownloader = null) : IUnitDownloader>> {
- public UnitDownloaderOptions Options { get; } = options;
+ public UnitDownloaderOptions Options { get; } = options;
public int LinksPerDownload { get; set; }
- private IUnitDownloader UnitDownloader { get; } = internalDownloader ?? new UnitDownloader(options);
+ private IUnitDownloader UnitDownloader { get; } = internalDownloader ?? new UnitDownloader(options);
async Task<(bool, Fragment>?)> IUnitDownloader>>.TryDownload(IOrdered[] link, CancellationToken ct, int maximumRetryCount, IProgress? downProgress, IProgress? tryProgress) {
Fragment> fragment = new Fragment>(link.Length);
diff --git a/Beam.Fluent/ContextStage.cs b/Beam.Fluent/ContextStage.cs
index c387c04..1d86a7f 100644
--- a/Beam.Fluent/ContextStage.cs
+++ b/Beam.Fluent/ContextStage.cs
@@ -8,14 +8,14 @@ using Beam.Downloaders;
namespace Beam.Fluent;
-internal sealed class ContextStage : IContextStage where RawType : IDocument {
- private readonly DownloadContextBuilder _ctxBuilder;
- private readonly AsyncTransformer _transformer;
+internal sealed class ContextStage : IContextStage {
+ private readonly DownloadContextBuilder _ctxBuilder;
+ private readonly AsyncTransformer _transformer;
private FragmentMode _fragmentMode = FragmentMode.Single;
private Channel _channel = Channel.Plain;
private readonly ContentKind _contentKind;
private int _parallelism = 4;
- private UnitDownloaderOptionsBuilder _optionsBuilder = new();
+ private UnitDownloaderOptionsBuilder _optionsBuilder = new();
// ──────────────── playwright ────────────────
private PlaywrightAsyncManipulator? _playwrightManipulator = null;
@@ -27,8 +27,8 @@ internal sealed class ContextStage : IContextStage ctxBuilder,
- AsyncTransformer transformer) {
+ public ContextStage(DownloadContextBuilder ctxBuilder,
+ AsyncTransformer transformer) {
_ctxBuilder = ctxBuilder;
_transformer = transformer;
_contentKind = transformer switch {
@@ -43,28 +43,28 @@ internal sealed class ContextStage : IContextStage Configure(Action> configure) {
+ public IContextStage Configure(Action configure) {
configure(_ctxBuilder);
return this;
}
- public IContextStage ConfigureUnitDownloaderOptions(
- Action> configure) {
+ public IContextStage ConfigureUnitDownloaderOptions(
+ Action> configure) {
configure(_optionsBuilder);
return this;
}
- public IContextStage WithParallelism(int degree) {
+ public IContextStage< OutType> WithParallelism(int degree) {
_parallelism = Math.Max(1, degree);
return this;
}
- public IContextStage WithTimeout(TimeSpan timeout) {
+ public IContextStage< OutType> WithTimeout(TimeSpan timeout) {
_ctxBuilder.WithTimeOut(timeout);
return this;
}
- public IContextStage WithRetryReporter(IProgress reporter) {
+ public IContextStage< OutType> WithRetryReporter(IProgress reporter) {
_ctxBuilder.WithRetryReporter(reporter);
return this;
}
@@ -73,7 +73,7 @@ internal sealed class ContextStage : IContextStage
///
///
- public IContextStage UseFragments() {
+ public IContextStage< OutType> UseFragments() {
if (_playwrightManipulator is not null)
_playwrightManipulator = null;
if (_channel == Channel.Playwright)
@@ -88,7 +88,7 @@ internal sealed class ContextStage : IContextStage
/// The page manipulator
///
- public IContextStage UsePlaywright(PlaywrightAsyncManipulator manipulator) {
+ public IContextStage< OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator) {
if (_fragmentMode == FragmentMode.Fragmented)
_fragmentMode = FragmentMode.Single;
if (_stealthManipulator is not null)
@@ -99,7 +99,7 @@ internal sealed class ContextStage : IContextStage UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) {
+ public IContextStage< OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) {
if (_playwrightManipulator is not null)
_playwrightManipulator = null;
@@ -109,7 +109,7 @@ internal sealed class ContextStage : IContextStage context) {
+ private object ConstructUnitDownloader(DownloadContext context) {
#region Utility functions
T To(object? o) where T : class
@@ -145,27 +145,19 @@ internal sealed class ContextStage : IContextStage new UnitFragmentDownloader(options),
+ => new UnitFragmentDownloader< OutType>(options),
// ──────────────── single ────────────────
(Channel.Plain, FragmentMode.Single, _)
- => new UnitDownloader(options),
+ => new UnitDownloader< OutType>(options),
// ──────────────── single playwright ────────────────
(Channel.Playwright, FragmentMode.Single, _)
- => new PlaywrightUnitDownloader(options, EnsureExists(_playwrightManipulator)),
- // ──────────────── single stealth file ────────────────
- (Channel.Stealth, FragmentMode.Single, ContentKind.File)
- => new StealthUnitPageDownloader(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)),
- // ──────────────── single stealth binary ────────────────
+ => new PlaywrightUnitDownloader< OutType>(options, EnsureExists(_playwrightManipulator)),
+ // ──────────────── single stealth ────────────────
(Channel.Stealth, FragmentMode.Single, ContentKind.Binary)
- => new StealthUnitDownloader(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)),
- // ──────────────── fragment stealth file ────────────────
- (Channel.Stealth, FragmentMode.Fragmented, ContentKind.File)
- => new StealthFragmentPageDownloader(options,
- EnsureExists(_stealthConfig),
- EnsureExists(_stealthManipulator)),
- // ──────────────── fragment stealth binary ────────────────
+ => new StealthUnitDownloader< OutType>(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)),
+ // ──────────────── fragment stealth ────────────────
(Channel.Stealth, FragmentMode.Fragmented, ContentKind.Binary)
- => new StealthFragmentDownloader(options,
+ => new StealthFragmentDownloader< OutType>(options,
EnsureExists(_stealthConfig),
EnsureExists(_stealthManipulator)),
_ => throw new Exception(string.Format(Exceptions.Exceptions.fluent_unsupported_pattern,
@@ -173,14 +165,14 @@ internal sealed class ContextStage : IContextStage> ConstructDownloader(DownloadContext context) {
- var copyOfContext = DownloadContextBuilder.FromContext(context).Build();
+ private IAsyncEnumerator> ConstructDownloader(DownloadContext context) {
+ var copyOfContext = DownloadContextBuilder.FromContext(context).Build();
return _fragmentMode switch {
- FragmentMode.Fragmented => new SequentialFragmentDownloader(
+ FragmentMode.Fragmented => new SequentialFragmentDownloader(
copyOfContext,
ctx => (IUnitDownloader>>)ConstructUnitDownloader(ctx),
context.DownloadLogger).UnwrapFragmented(),
- FragmentMode.Single => new SequentialDownloader(
+ FragmentMode.Single => new SequentialDownloader< OutType>(
copyOfContext,
ctx => (IUnitDownloader)ConstructUnitDownloader(ctx),
context.DownloadLogger).WrapOrdered(),
diff --git a/Beam.Fluent/Core/IContextStage.cs b/Beam.Fluent/Core/IContextStage.cs
index 5ad4486..2be7bf0 100644
--- a/Beam.Fluent/Core/IContextStage.cs
+++ b/Beam.Fluent/Core/IContextStage.cs
@@ -6,15 +6,15 @@ using Beam.Stealth;
namespace Beam.Fluent;
-public interface IContextStage