Refactor downloaders to use ByteDocument and add options builders

Replaces generic RawType with ByteDocument in downloaders and context classes, simplifying type usage. Adds builder classes for FailurePredicateOptions, FragmentOptions, SkipPredicateOptions, and UnitDownloaderOptions to improve configuration flexibility. Introduces DownloadTarget enum and SkipPredicate delegate for more granular download control. Refactors Fluent API interfaces and implementations to remove RawType generics and streamline usage. Adds Playwright and Stealth download strategies for extensibility.
This commit is contained in:
qwsdcvghyu89
2025-11-15 22:51:46 +11:00
parent 647b2b0f37
commit f52aa6123b
34 changed files with 648 additions and 439 deletions
+1
View File
@@ -14,6 +14,7 @@
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="9.0.9" />
<PackageReference Include="System.IO.Hashing" Version="10.0.0" />
<PackageReference Include="System.Linq.Async" Version="6.0.3" />
</ItemGroup>
+3 -3
View File
@@ -8,14 +8,14 @@ namespace Beam.Downloaders {
//public delegate Task<T> AsyncHtmlTransformer<T>(HtmlDocument doc);
//public delegate Task<T> AsyncBinaryTransformer<T>(byte[] bin);
public class DownloadContext<RawType> {
public class DownloadContext {
private bool disposedValue;
public HttpClient Client { get; }
public HtmlWeb Web { get; }
public IProgress<IDownloadReport>? DownloadReporter { get; set; }
public IProgress<IRetryReport>? RetryReporter { get; set; }
public AsyncDownloadFailurePredicate<RawType>?[]? AsyncFailurePredicates { get; }
public AsyncDownloadFailurePredicate<ByteDocument>?[]? AsyncFailurePredicates { get; }
public TimeSpan TimeOut { get; set; }
public IEnumerable<string> Links { get; }
public CancellationToken CancellationToken { get; }
@@ -28,7 +28,7 @@ namespace Beam.Downloaders {
CancellationToken cancellationToken = default,
IProgress<IDownloadReport>? downloadReporter = null,
IProgress<IRetryReport>? retryReporter = null,
AsyncDownloadFailurePredicate<RawType>?[]? asyncFailurePredicates = null,
AsyncDownloadFailurePredicate<ByteDocument>?[]? asyncFailurePredicates = null,
TimeSpan? timeOut = null,
ILogger? downloadLogger = null) {
ArgumentNullException.ThrowIfNull(web, nameof(web));
+17 -17
View File
@@ -5,12 +5,12 @@ using Microsoft.Extensions.Logging;
namespace Beam.Downloaders {
public class DownloadContextBuilder<RawType> {
public class DownloadContextBuilder {
private HtmlWeb _web;
private HttpClient _client;
private IProgress<IDownloadReport>? _downloadReporter;
private IProgress<IRetryReport>? _retryReporter;
private AsyncDownloadFailurePredicate<RawType>?[] _asyncFailurePredicates = [];
private AsyncDownloadFailurePredicate<ByteDocument>?[] _asyncFailurePredicates = [];
private TimeSpan _timeOut;
private IEnumerable<string> _links;
private CancellationToken _cancellationToken;
@@ -26,60 +26,60 @@ namespace Beam.Downloaders {
_links = [];
}
public DownloadContextBuilder<RawType> WithWeb(HtmlWeb web) {
public DownloadContextBuilder WithWeb(HtmlWeb web) {
_web = web;
return this;
}
public DownloadContextBuilder<RawType> WithClient(HttpClient client) {
public DownloadContextBuilder WithClient(HttpClient client) {
_client = client;
return this;
}
public DownloadContextBuilder<RawType> WithDownloadReporter(IProgress<IDownloadReport> downloadReporter) {
public DownloadContextBuilder WithDownloadReporter(IProgress<IDownloadReport> downloadReporter) {
_downloadReporter = downloadReporter;
return this;
}
public DownloadContextBuilder<RawType> WithRetryReporter(IProgress<IRetryReport> retryReporter) {
public DownloadContextBuilder WithRetryReporter(IProgress<IRetryReport> retryReporter) {
_retryReporter = retryReporter;
return this;
}
public DownloadContextBuilder<RawType> WithAsyncFailurePredicates(params AsyncDownloadFailurePredicate<RawType>[] predicates) {
public DownloadContextBuilder WithAsyncFailurePredicates(params AsyncDownloadFailurePredicate<ByteDocument>[] predicates) {
_asyncFailurePredicates = predicates;
return this;
}
public DownloadContextBuilder<RawType> WithTimeOut(TimeSpan timeOut) {
public DownloadContextBuilder WithTimeOut(TimeSpan timeOut) {
_timeOut = timeOut;
return this;
}
public DownloadContextBuilder<RawType> WithLinks(IEnumerable<string> links) {
public DownloadContextBuilder WithLinks(IEnumerable<string> links) {
_links = links;
return this;
}
public DownloadContextBuilder<RawType> WithCancellationToken(CancellationToken cancellationToken) {
public DownloadContextBuilder WithCancellationToken(CancellationToken cancellationToken) {
_cancellationToken = cancellationToken;
return this;
}
public DownloadContextBuilder<RawType> WithCache(DocumentCache cache) {
public DownloadContextBuilder WithCache(DocumentCache cache) {
_cache = cache;
return this;
}
public DownloadContextBuilder<RawType> WithDownloadLogger(ILogger downloadLogger) {
public DownloadContextBuilder WithDownloadLogger(ILogger downloadLogger) {
_downloadLogger = downloadLogger;
return this;
}
public DownloadContext<RawType> Build() {
public DownloadContext Build() {
// Construct the DownloadContext<T> using the collected values.
var context = new DownloadContext<RawType>(
var context = new DownloadContext(
web: _web,
client: _client,
links: _links,
@@ -100,15 +100,15 @@ namespace Beam.Downloaders {
return context;
}
public static DownloadContextBuilder<RawType> FromContext(DownloadContext<RawType> existing) {
public static DownloadContextBuilder FromContext(DownloadContext existing) {
if (existing == null) throw new ArgumentNullException(nameof(existing));
return new DownloadContextBuilder<RawType>(existing.Client, existing.Web)
return new DownloadContextBuilder(existing.Client, existing.Web)
.WithLinks(existing.Links)
.WithCancellationToken(existing.CancellationToken)
.WithDownloadReporter(existing.DownloadReporter!)
.WithRetryReporter(existing.RetryReporter!)
.WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty<AsyncDownloadFailurePredicate<RawType>>())
.WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty<AsyncDownloadFailurePredicate<ByteDocument>>())
.WithTimeOut(existing.TimeOut)
.WithDownloadLogger(existing.DownloadLogger!)
.WithCache(existing.Cache);
@@ -0,0 +1,9 @@
using Beam.Models;
namespace Beam.Downloaders;
public record class FailurePredicateOptions<RawType> {
public required AsyncDownloadFailurePredicate<RawType>?[]? AsyncDownloadFailurePredicates { get; init; }
public bool ProcessInParallel { get; init; } = false;
public int? ParallelThreads { get; init; }
}
@@ -0,0 +1,56 @@
using Beam.Models;
namespace Beam.Downloaders;
public sealed class FailurePredicateOptionsBuilder<TRaw>
{
private readonly System.Collections.Generic.List<AsyncDownloadFailurePredicate<TRaw>?> _predicates =
new System.Collections.Generic.List<AsyncDownloadFailurePredicate<TRaw>?>();
private bool _processInParallel = false;
private int? _parallelThreads = null;
public FailurePredicateOptionsBuilder<TRaw> WithPredicate(AsyncDownloadFailurePredicate<TRaw>? predicate)
{
_predicates.Add(predicate);
return this;
}
public FailurePredicateOptionsBuilder<TRaw> WithPredicates(System.Collections.Generic.IEnumerable<AsyncDownloadFailurePredicate<TRaw>?> predicates)
{
if (predicates == null) throw new System.ArgumentNullException(nameof(predicates));
_predicates.AddRange(predicates);
return this;
}
public FailurePredicateOptionsBuilder<TRaw> WithPredicates(params AsyncDownloadFailurePredicate<TRaw>?[] predicates)
{
_predicates.Clear();
if (predicates != null) _predicates.AddRange(predicates);
return this;
}
public FailurePredicateOptionsBuilder<TRaw> WithProcessInParallel(bool value = true)
{
_processInParallel = value;
return this;
}
public FailurePredicateOptionsBuilder<TRaw> WithParallelThreads(int? threads)
{
if (threads.HasValue && threads.Value <= 0)
throw new System.ArgumentOutOfRangeException(nameof(threads));
_parallelThreads = threads;
return this;
}
public FailurePredicateOptions<TRaw> Build()
{
var arr = _predicates.Count == 0 ? [] : _predicates.ToArray();
return new FailurePredicateOptions<TRaw>
{
AsyncDownloadFailurePredicates = arr,
ProcessInParallel = _processInParallel,
ParallelThreads = _parallelThreads
};
}
}
+7
View File
@@ -0,0 +1,7 @@
namespace Beam.Downloaders;
public record class FragmentOptions {
public required int FragmentSize { get; init; }
public bool DownloadInParallel { get; init; } = false;
public int? ParallelThreads { get; init; }
}
@@ -0,0 +1,36 @@
namespace Beam.Downloaders;
public sealed class FragmentOptionsBuilder {
private int? _fragmentSize;
private bool _downloadInParallel = false;
private int? _parallelThreads = null;
public FragmentOptionsBuilder WithFragmentSize(int bytes) {
if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes));
_fragmentSize = bytes;
return this;
}
public FragmentOptionsBuilder WithDownloadInParallel(bool value = true) {
_downloadInParallel = value;
return this;
}
public FragmentOptionsBuilder WithParallelThreads(int? threads) {
if (threads.HasValue && threads.Value <= 0)
throw new System.ArgumentOutOfRangeException(nameof(threads));
_parallelThreads = threads;
return this;
}
public FragmentOptions Build() {
if (!_fragmentSize.HasValue)
throw new System.InvalidOperationException("FragmentSize must be provided.");
return new FragmentOptions {
FragmentSize = _fragmentSize.Value,
DownloadInParallel = _downloadInParallel,
ParallelThreads = _parallelThreads
};
}
}
+3 -3
View File
@@ -3,9 +3,9 @@ using Beam.Models;
using Microsoft.Extensions.Logging;
namespace Beam.Downloaders {
public class SequentialDownloader<RawType, OutType> : IAsyncEnumerator<OutType> {
public class SequentialDownloader<OutType> : IAsyncEnumerator<OutType> {
public OutType Current { get; protected set; }
public DownloadContext<RawType> Context { get; }
public DownloadContext Context { get; }
public ILogger? Logger { get; set; }
public int LastOrder { get; set; } = 0;
@@ -13,7 +13,7 @@ namespace Beam.Downloaders {
public Func<IUnitDownloader<OutType>> GetUnitDownloader { get; set; }
public SequentialDownloader(DownloadContext<RawType> context, Func<DownloadContext<RawType>, IUnitDownloader<OutType>> getUnitDownloader, ILogger? logger = null) {
public SequentialDownloader(DownloadContext context, Func<DownloadContext, IUnitDownloader<OutType>> getUnitDownloader, ILogger? logger = null) {
Context = context;
Logger = logger;
LinksEnumerator = Context.Links.GetEnumerator();
@@ -3,10 +3,10 @@ using Beam.Models;
using Microsoft.Extensions.Logging;
namespace Beam.Downloaders {
public class SequentialFragmentDownloader<RawType, OutType> : SequentialDownloader<RawType, Fragment<Ordered<OutType>>> {
public class SequentialFragmentDownloader<OutType> : SequentialDownloader<Fragment<Ordered<OutType>>> {
public SequentialFragmentDownloader(
DownloadContext<RawType> context,
Func<DownloadContext<RawType>, IUnitDownloader<Fragment<Ordered<OutType>>>> getUnitDownloader,
DownloadContext context,
Func<DownloadContext, IUnitDownloader<Fragment<Ordered<OutType>>>> getUnitDownloader,
ILogger? logger = null)
: base(context, getUnitDownloader, logger) {}
}
+48
View File
@@ -0,0 +1,48 @@
using Beam.Models;
namespace Beam.Downloaders;
public class SkipPredicateOptions<OutType> {
public required SkipPredicate<OutType>?[]? SkipPredicates { get; init; }
public bool ProcessInParallel { get; init; } = false;
public int? ParallelThreads { get; init; }
}
public class SkipPredicateOptionsBuilder<OutType> {
private List<SkipPredicate<OutType>?> _skipPredicates { get; set; } = [];
private bool _processInParallel { get; set; } = false;
private int? _parallelThreads { get; set; }
public SkipPredicateOptionsBuilder<OutType> WithSkipPredicate(SkipPredicate<OutType> predicate, bool replace=false) {
if (replace)
_skipPredicates.Clear();
_skipPredicates.Add(predicate);
return this;
}
public SkipPredicateOptionsBuilder<OutType> WithSkipPredicates(SkipPredicate<OutType>[] predicates,
bool replace = true) {
if (replace)
_skipPredicates.Clear();
_skipPredicates.AddRange(predicates);
return this;
}
public SkipPredicateOptionsBuilder<OutType> ProcessInParallel(bool processInParallel = true) {
_processInParallel = processInParallel;
return this;
}
public SkipPredicateOptionsBuilder<OutType> WithParallelThreads(int parallelThreads) {
_parallelThreads = parallelThreads;
return this;
}
public SkipPredicateOptions<OutType> Build() {
return new SkipPredicateOptions<OutType>() {
SkipPredicates = _skipPredicates.ToArray(),
ParallelThreads = _parallelThreads,
ProcessInParallel = _processInParallel
};
}
}
+55 -15
View File
@@ -1,4 +1,6 @@
using Beam.Abstractions;
using System.Diagnostics.CodeAnalysis;
using System.Text;
using Beam.Abstractions;
using Beam.Models;
using HtmlAgilityPack;
using File = System.IO.File;
@@ -11,12 +13,12 @@ namespace Beam.Downloaders {
/// <param name="web"></param>
/// <param name="transformer"></param>
/// <param name="failurePredicate"></param>
public class UnitDownloader<RawType, OutType>(UnitDownloaderOptions<RawType, OutType> options) : IUnitDownloader<OutType> where RawType : IDocument {
public UnitDownloaderOptions<RawType, OutType> Options { get; } = options;
public class UnitDownloader<OutType>(UnitDownloaderOptions<OutType> options) : IUnitDownloader<OutType> {
public UnitDownloaderOptions<OutType> Options { get; } = options;
public HttpClient Client => Options.Client;
public virtual AsyncTransformer<RawType, OutType> Transformer => Options.AsyncTransformer;
public virtual AsyncDownloadFailurePredicate<RawType>?[]? FailurePredicates =>
public virtual AsyncTransformer<ByteDocument, OutType> Transformer => Options.AsyncTransformer;
public virtual AsyncDownloadFailurePredicate<ByteDocument>?[]? FailurePredicates =>
Options?.FailurePredicateOptions?.AsyncDownloadFailurePredicates;
public int LinksPerDownload { get; } = 1;
@@ -70,7 +72,7 @@ namespace Beam.Downloaders {
return new ByteDocument(url, bytes);
}
protected virtual async Task<bool> IsFailure(RawType doc, CancellationToken ct) {
protected virtual async Task<bool> IsFailure(ByteDocument doc, CancellationToken ct) {
if (FailurePredicates is null)
return false;
if (!(Options?.FailurePredicateOptions?.ProcessInParallel ?? false))
@@ -103,19 +105,18 @@ namespace Beam.Downloaders {
return false;
}
protected virtual async Task<RawType> _Download(string link, IProgress<IDownloadReport> progress, CancellationToken ct) {
if (Options.DownloadFolder is not null && this is UnitDownloader<StringDocument, OutType>) {
var path = Path.Combine(Options.DownloadFolder, Path.GetRandomFileName());
protected virtual async Task<ByteDocument> _Download(string link, IProgress<IDownloadReport> progress, CancellationToken ct) {
if (Options.DownloadFolder is not null) {
var path = Path.Combine(Options.DownloadFolder, options.GetFileNameForDownload(link, []));
await DownloadToFile(link, Options.BufferSize, path, progress, ct);
return (RawType)(object)new StringDocument(link, path);
return new ByteDocument(link, Encoding.UTF8.GetBytes(path));
}
if (this is UnitDownloader<ByteDocument, OutType>) {
return (RawType)(object)(await DownloadToMemory(link, Options.BufferSize, progress, ct));
else {
return await DownloadToMemory(link, Options.BufferSize, progress, ct);
}
throw new NotSupportedException(Exceptions.Exceptions.unit_downloader_limited_support);
}
protected virtual async Task<(bool, OutType?)> Transform(RawType download, CancellationToken ct) {
protected virtual async Task<(bool, OutType?)> Transform(ByteDocument download, CancellationToken ct) {
try {
if (FailurePredicates is null || !(await IsFailure(download, ct)))
return (true, await Transformer(download));
@@ -131,6 +132,9 @@ namespace Beam.Downloaders {
return (false, default);
downProgress ??= new Progress<IDownloadReport>();
if (ShouldSkip(link[0].Data, out var defaultType))
return (true, defaultType);
OutType? ot = default;
int tryCount = 0;
@@ -147,5 +151,41 @@ namespace Beam.Downloaders {
return (false, ot);
}
private bool ShouldSkip(string link, [NotNullWhen(true)] out OutType? outType) {
outType = default;
if (Options.SkipPredicateOptions?.SkipPredicates is null)
return false;
if (!Options.SkipPredicateOptions.ProcessInParallel)
foreach (var pred in Options.SkipPredicateOptions.SkipPredicates) {
if (pred is null)
continue;
if (pred(link, out outType))
return true;
}
else {
var shouldSkip = false;
OutType? _outType = default;
Parallel.ForEach(Options.SkipPredicateOptions.SkipPredicates, new ParallelOptions() {
MaxDegreeOfParallelism = Options?.FailurePredicateOptions?.ParallelThreads ?? 4
},
(predicate, parallelLoopState) => {
if (parallelLoopState.ShouldExitCurrentIteration)
return;
if (predicate == null)
return;
if (predicate(link, out var _innerLoopOutType)) {
Interlocked.CompareExchange(ref shouldSkip, true, false);
Interlocked.CompareExchange(ref _outType, _innerLoopOutType, default);
parallelLoopState.Break();
}
}
);
outType = _outType;
return shouldSkip;
}
return false;
}
}
}
+25 -185
View File
@@ -1,198 +1,38 @@
using System.Diagnostics.CodeAnalysis;
using System.Security.Cryptography;
using System.Text;
using Beam.Models;
namespace Beam.Downloaders;
public record class UnitDownloaderOptions<RawType, OutType> {
public record class UnitDownloaderOptions<OutType> {
public HttpClient Client { get; init; } = new();
public DownloadTarget Target { get; init; } = DownloadTarget.URL;
public FailurePredicateOptions<RawType>? FailurePredicateOptions { get; init; }
public SkipPredicateOptions<OutType>? SkipPredicateOptions { get; init; }
public FailurePredicateOptions<ByteDocument>? FailurePredicateOptions { get; init; }
public FragmentOptions? FragmentOptions { get; init; }
public required AsyncTransformer<RawType, OutType> AsyncTransformer { get; init; }
public required AsyncTransformer<ByteDocument, OutType> AsyncTransformer { get; init; }
/// <summary>
/// The location where the download is stored.
/// </summary>
/// <remarks>
/// If not defined, <c>UnitDownloader.TryDownload()</c> downloads to memory.
/// </remarks>
public string? DownloadFolder { get; init; } = null;
public int BufferSize { get; init; } = 80 * 1024; // 80kb
public string GetFileNameForDownload(string url, byte[] additionalData) {
byte[] bytes = [..Encoding.UTF8.GetBytes(url), ..additionalData];
var name = Convert.ToBase64String(System.IO.Hashing.XxHash64.Hash(bytes));
return name.Replace('+', '-').Replace('/', '_').Replace('=', ' ').Trim();
}
}
public record class FailurePredicateOptions<RawType> {
public required AsyncDownloadFailurePredicate<RawType>?[]? AsyncDownloadFailurePredicates { get; init; }
public bool ProcessInParallel { get; init; } = false;
public int? ParallelThreads { get; init; }
}
// ---------- UnitDownloaderOptions Builder ----------
public record class FragmentOptions {
public required int FragmentSize { get; init; }
public bool DownloadInParallel { get; init; } = false;
public int? ParallelThreads { get; init; }
}
// ---------- FailurePredicateOptions Builder ----------
// ---------- UnitDownloaderOptions Builder ----------
public sealed class UnitDownloaderOptionsBuilder<TRaw, TOut>
{
private HttpClient _client = new HttpClient();
private FailurePredicateOptions<TRaw>? _failureOptions;
private FragmentOptions? _fragmentOptions;
private AsyncTransformer<TRaw, TOut>? _asyncTransformer;
private string? _downloadFolder = null;
private int _bufferSize = 80 * 1024;
public UnitDownloaderOptionsBuilder<TRaw, TOut> WithClient(HttpClient client)
{
_client = client ?? throw new System.ArgumentNullException(nameof(client));
return this;
}
public UnitDownloaderOptionsBuilder<TRaw, TOut> WithFailurePredicateOptions(FailurePredicateOptions<TRaw>? options)
{
_failureOptions = options;
return this;
}
public UnitDownloaderOptionsBuilder<TRaw, TOut> WithFailurePredicates(System.Action<FailurePredicateOptionsBuilder<TRaw>> configure)
{
if (configure == null) throw new System.ArgumentNullException(nameof(configure));
var b = new FailurePredicateOptionsBuilder<TRaw>();
configure(b);
_failureOptions = b.Build();
return this;
}
public UnitDownloaderOptionsBuilder<TRaw, TOut> WithFragmentOptions(FragmentOptions? options)
{
_fragmentOptions = options;
return this;
}
public UnitDownloaderOptionsBuilder<TRaw, TOut> WithFragments(System.Action<FragmentOptionsBuilder> configure)
{
if (configure == null) throw new System.ArgumentNullException(nameof(configure));
var b = new FragmentOptionsBuilder();
configure(b);
_fragmentOptions = b.Build();
return this;
}
public UnitDownloaderOptionsBuilder<TRaw, TOut> WithAsyncTransformer(AsyncTransformer<TRaw, TOut> transformer)
{
_asyncTransformer = transformer ?? throw new System.ArgumentNullException(nameof(transformer));
return this;
}
public UnitDownloaderOptionsBuilder<TRaw, TOut> WithDownloadFolder(string? downloadFolder)
{
_downloadFolder = downloadFolder;
return this;
}
public UnitDownloaderOptionsBuilder<TRaw, TOut> WithBufferSize(int bytes)
{
if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes));
_bufferSize = bytes;
return this;
}
public UnitDownloaderOptions<TRaw, TOut> Build()
{
if (_asyncTransformer == null)
throw new System.InvalidOperationException("AsyncTransformer must be provided.");
return new UnitDownloaderOptions<TRaw, TOut>
{
Client = _client,
FailurePredicateOptions = _failureOptions,
FragmentOptions = _fragmentOptions,
AsyncTransformer = _asyncTransformer,
DownloadFolder = _downloadFolder,
BufferSize = _bufferSize
};
}
}
// ---------- FailurePredicateOptions Builder ----------
public sealed class FailurePredicateOptionsBuilder<TRaw>
{
private readonly System.Collections.Generic.List<AsyncDownloadFailurePredicate<TRaw>?> _predicates =
new System.Collections.Generic.List<AsyncDownloadFailurePredicate<TRaw>?>();
private bool _processInParallel = false;
private int? _parallelThreads = null;
public FailurePredicateOptionsBuilder<TRaw> WithPredicate(AsyncDownloadFailurePredicate<TRaw>? predicate)
{
_predicates.Add(predicate);
return this;
}
public FailurePredicateOptionsBuilder<TRaw> WithPredicates(System.Collections.Generic.IEnumerable<AsyncDownloadFailurePredicate<TRaw>?> predicates)
{
if (predicates == null) throw new System.ArgumentNullException(nameof(predicates));
_predicates.AddRange(predicates);
return this;
}
public FailurePredicateOptionsBuilder<TRaw> WithPredicates(params AsyncDownloadFailurePredicate<TRaw>?[] predicates)
{
_predicates.Clear();
if (predicates != null) _predicates.AddRange(predicates);
return this;
}
public FailurePredicateOptionsBuilder<TRaw> WithProcessInParallel(bool value = true)
{
_processInParallel = value;
return this;
}
public FailurePredicateOptionsBuilder<TRaw> WithParallelThreads(int? threads)
{
if (threads.HasValue && threads.Value <= 0)
throw new System.ArgumentOutOfRangeException(nameof(threads));
_parallelThreads = threads;
return this;
}
public FailurePredicateOptions<TRaw> Build()
{
var arr = _predicates.Count == 0 ? [] : _predicates.ToArray();
return new FailurePredicateOptions<TRaw>
{
AsyncDownloadFailurePredicates = arr,
ProcessInParallel = _processInParallel,
ParallelThreads = _parallelThreads
};
}
}
// ---------- FragmentOptions Builder ----------
public sealed class FragmentOptionsBuilder {
private int? _fragmentSize;
private bool _downloadInParallel = false;
private int? _parallelThreads = null;
public FragmentOptionsBuilder WithFragmentSize(int bytes) {
if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes));
_fragmentSize = bytes;
return this;
}
public FragmentOptionsBuilder WithDownloadInParallel(bool value = true) {
_downloadInParallel = value;
return this;
}
public FragmentOptionsBuilder WithParallelThreads(int? threads) {
if (threads.HasValue && threads.Value <= 0)
throw new System.ArgumentOutOfRangeException(nameof(threads));
_parallelThreads = threads;
return this;
}
public FragmentOptions Build() {
if (!_fragmentSize.HasValue)
throw new System.InvalidOperationException("FragmentSize must be provided.");
return new FragmentOptions {
FragmentSize = _fragmentSize.Value,
DownloadInParallel = _downloadInParallel,
ParallelThreads = _parallelThreads
};
}
}
// ---------- FragmentOptions Builder ----------
@@ -0,0 +1,107 @@
using Beam.Models;
namespace Beam.Downloaders;
public sealed class UnitDownloaderOptionsBuilder<OutType> {
private DownloadTarget _target = DownloadTarget.URL;
private HttpClient _client = new HttpClient();
private FailurePredicateOptionsBuilder<ByteDocument> _failureOptionsBuilder = new();
private FailurePredicateOptions<ByteDocument>? _failurePredicateOptionsOverride = null;
private SkipPredicateOptionsBuilder<OutType> _skipPredicateOptionsBuilder = new();
private SkipPredicateOptions<OutType>? _skipPredicateOptionsOverride = null;
private FragmentOptions? _fragmentOptions;
private AsyncTransformer<ByteDocument, OutType>? _asyncTransformer;
private string? _downloadFolder = null;
private int _bufferSize = 80 * 1024;
public UnitDownloaderOptionsBuilder<OutType> WithTarget(DownloadTarget target) {
_target = target;
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithClient(HttpClient client)
{
_client = client ?? throw new System.ArgumentNullException(nameof(client));
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithFailurePredicateOptions(FailurePredicateOptions<ByteDocument>? options)
{
_failurePredicateOptionsOverride = options;
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithFailurePredicates(System.Action<FailurePredicateOptionsBuilder<ByteDocument>> configure)
{
if (configure == null) throw new System.ArgumentNullException(nameof(configure));
configure(_failureOptionsBuilder);
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithFragmentOptions(FragmentOptions? options)
{
_fragmentOptions = options;
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithSkipPredicates(Action<SkipPredicateOptionsBuilder<OutType>> configure) {
if (configure == null) throw new ArgumentNullException(nameof(configure));
configure(_skipPredicateOptionsBuilder);
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithSkipPredicateOptions(
SkipPredicateOptions<OutType> skipPredicateOptions) {
_skipPredicateOptionsOverride = skipPredicateOptions;
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithFragments(System.Action<FragmentOptionsBuilder> configure)
{
if (configure == null) throw new System.ArgumentNullException(nameof(configure));
var b = new FragmentOptionsBuilder();
configure(b);
_fragmentOptions = b.Build();
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithAsyncTransformer(AsyncTransformer<ByteDocument, OutType> transformer)
{
_asyncTransformer = transformer ?? throw new System.ArgumentNullException(nameof(transformer));
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithDownloadFolder(string? downloadFolder)
{
_downloadFolder = downloadFolder;
return this;
}
public UnitDownloaderOptionsBuilder<OutType> WithBufferSize(int bytes)
{
if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes));
_bufferSize = bytes;
return this;
}
public UnitDownloaderOptions<OutType> Build()
{
if (_asyncTransformer == null)
throw new System.InvalidOperationException("AsyncTransformer must be provided.");
_failurePredicateOptionsOverride ??= _failureOptionsBuilder.Build();
_skipPredicateOptionsOverride ??= _skipPredicateOptionsBuilder.Build();
return new UnitDownloaderOptions<OutType>
{
Target = _target,
Client = _client,
FailurePredicateOptions = _failurePredicateOptionsOverride,
SkipPredicateOptions = _skipPredicateOptionsOverride,
FragmentOptions = _fragmentOptions,
AsyncTransformer = _asyncTransformer,
DownloadFolder = _downloadFolder,
BufferSize = _bufferSize
};
}
}
+4 -4
View File
@@ -5,12 +5,12 @@ using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
namespace Beam.Downloaders {
public class UnitFragmentDownloader<RawType, OutType>(UnitDownloaderOptions<RawType, OutType> options,
IUnitDownloader<OutType>? internalDownloader = null) : IUnitDownloader<Fragment<Ordered<OutType>>> where RawType : IDocument {
public class UnitFragmentDownloader<OutType>(UnitDownloaderOptions<OutType> options,
IUnitDownloader<OutType>? internalDownloader = null) : IUnitDownloader<Fragment<Ordered<OutType>>> {
public UnitDownloaderOptions<RawType, OutType> Options { get; } = options;
public UnitDownloaderOptions<OutType> Options { get; } = options;
public int LinksPerDownload { get; set; }
private IUnitDownloader<OutType> UnitDownloader { get; } = internalDownloader ?? new UnitDownloader<RawType, OutType>(options);
private IUnitDownloader<OutType> UnitDownloader { get; } = internalDownloader ?? new UnitDownloader<OutType>(options);
async Task<(bool, Fragment<Ordered<OutType>>?)> IUnitDownloader<Fragment<Ordered<OutType>>>.TryDownload(IOrdered<string>[] link, CancellationToken ct, int maximumRetryCount, IProgress<IDownloadReport>? downProgress, IProgress<IRetryReport>? tryProgress) {
Fragment<Ordered<OutType>> fragment = new Fragment<Ordered<OutType>>(link.Length);
+27 -35
View File
@@ -8,14 +8,14 @@ using Beam.Downloaders;
namespace Beam.Fluent;
internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, OutType> where RawType : IDocument {
private readonly DownloadContextBuilder<RawType> _ctxBuilder;
private readonly AsyncTransformer<RawType, OutType> _transformer;
internal sealed class ContextStage<OutType> : IContextStage<OutType> {
private readonly DownloadContextBuilder _ctxBuilder;
private readonly AsyncTransformer<ByteDocument, OutType> _transformer;
private FragmentMode _fragmentMode = FragmentMode.Single;
private Channel _channel = Channel.Plain;
private readonly ContentKind _contentKind;
private int _parallelism = 4;
private UnitDownloaderOptionsBuilder<RawType, OutType> _optionsBuilder = new();
private UnitDownloaderOptionsBuilder<OutType> _optionsBuilder = new();
// ──────────────── playwright ────────────────
private PlaywrightAsyncManipulator? _playwrightManipulator = null;
@@ -27,8 +27,8 @@ internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, Ou
// ────────────────────────────────────────────
public ContextStage(DownloadContextBuilder<RawType> ctxBuilder,
AsyncTransformer<RawType, OutType> transformer) {
public ContextStage(DownloadContextBuilder ctxBuilder,
AsyncTransformer<ByteDocument, OutType> transformer) {
_ctxBuilder = ctxBuilder;
_transformer = transformer;
_contentKind = transformer switch {
@@ -43,28 +43,28 @@ internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, Ou
.WithAsyncTransformer(_transformer);
}
public IContextStage<RawType, OutType> Configure(Action<DownloadContextBuilder<RawType>> configure) {
public IContextStage<OutType> Configure(Action<DownloadContextBuilder> configure) {
configure(_ctxBuilder);
return this;
}
public IContextStage<RawType, OutType> ConfigureUnitDownloaderOptions(
Action<UnitDownloaderOptionsBuilder<RawType, OutType>> configure) {
public IContextStage<OutType> ConfigureUnitDownloaderOptions(
Action<UnitDownloaderOptionsBuilder< OutType>> configure) {
configure(_optionsBuilder);
return this;
}
public IContextStage<RawType, OutType> WithParallelism(int degree) {
public IContextStage< OutType> WithParallelism(int degree) {
_parallelism = Math.Max(1, degree);
return this;
}
public IContextStage<RawType, OutType> WithTimeout(TimeSpan timeout) {
public IContextStage< OutType> WithTimeout(TimeSpan timeout) {
_ctxBuilder.WithTimeOut(timeout);
return this;
}
public IContextStage<RawType, OutType> WithRetryReporter(IProgress<IRetryReport> reporter) {
public IContextStage< OutType> WithRetryReporter(IProgress<IRetryReport> reporter) {
_ctxBuilder.WithRetryReporter(reporter);
return this;
}
@@ -73,7 +73,7 @@ internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, Ou
/// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive with <see cref="UsePlaywright(PlaywrightAsyncManipulator)"/>
/// </summary>
/// <returns></returns>
public IContextStage<RawType, OutType> UseFragments() {
public IContextStage< OutType> UseFragments() {
if (_playwrightManipulator is not null)
_playwrightManipulator = null;
if (_channel == Channel.Playwright)
@@ -88,7 +88,7 @@ internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, Ou
/// </summary>
/// <param name="manipulator">The page manipulator</param>
/// <returns></returns>
public IContextStage<RawType, OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator) {
public IContextStage< OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator) {
if (_fragmentMode == FragmentMode.Fragmented)
_fragmentMode = FragmentMode.Single;
if (_stealthManipulator is not null)
@@ -99,7 +99,7 @@ internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, Ou
return this;
}
public IContextStage<RawType, OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) {
public IContextStage< OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) {
if (_playwrightManipulator is not null)
_playwrightManipulator = null;
@@ -109,7 +109,7 @@ internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, Ou
return this;
}
private object ConstructUnitDownloader(DownloadContext<RawType> context) {
private object ConstructUnitDownloader(DownloadContext context) {
#region Utility functions
T To<T>(object? o) where T : class
@@ -145,27 +145,19 @@ internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, Ou
return (_channel, _fragmentMode, _contentKind) switch {
// ──────────────── fragmented ────────────────
(Channel.Plain, FragmentMode.Fragmented, _)
=> new UnitFragmentDownloader<RawType, OutType>(options),
=> new UnitFragmentDownloader< OutType>(options),
// ──────────────── single ────────────────
(Channel.Plain, FragmentMode.Single, _)
=> new UnitDownloader<RawType, OutType>(options),
=> new UnitDownloader< OutType>(options),
// ──────────────── single playwright ────────────────
(Channel.Playwright, FragmentMode.Single, _)
=> new PlaywrightUnitDownloader<RawType, OutType>(options, EnsureExists(_playwrightManipulator)),
// ──────────────── single stealth file ────────────────
(Channel.Stealth, FragmentMode.Single, ContentKind.File)
=> new StealthUnitPageDownloader<RawType, OutType>(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)),
// ──────────────── single stealth binary ────────────────
=> new PlaywrightUnitDownloader< OutType>(options, EnsureExists(_playwrightManipulator)),
// ──────────────── single stealth ────────────────
(Channel.Stealth, FragmentMode.Single, ContentKind.Binary)
=> new StealthUnitDownloader<RawType, OutType>(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)),
// ──────────────── fragment stealth file ────────────────
(Channel.Stealth, FragmentMode.Fragmented, ContentKind.File)
=> new StealthFragmentPageDownloader<RawType, OutType>(options,
EnsureExists(_stealthConfig),
EnsureExists(_stealthManipulator)),
// ──────────────── fragment stealth binary ────────────────
=> new StealthUnitDownloader< OutType>(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)),
// ──────────────── fragment stealth ────────────────
(Channel.Stealth, FragmentMode.Fragmented, ContentKind.Binary)
=> new StealthFragmentDownloader<RawType, OutType>(options,
=> new StealthFragmentDownloader< OutType>(options,
EnsureExists(_stealthConfig),
EnsureExists(_stealthManipulator)),
_ => throw new Exception(string.Format(Exceptions.Exceptions.fluent_unsupported_pattern,
@@ -173,14 +165,14 @@ internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, Ou
};
}
private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
var copyOfContext = DownloadContextBuilder<RawType>.FromContext(context).Build();
private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext context) {
var copyOfContext = DownloadContextBuilder.FromContext(context).Build();
return _fragmentMode switch {
FragmentMode.Fragmented => new SequentialFragmentDownloader<RawType, OutType>(
FragmentMode.Fragmented => new SequentialFragmentDownloader<OutType>(
copyOfContext,
ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
context.DownloadLogger).UnwrapFragmented(),
FragmentMode.Single => new SequentialDownloader<RawType, OutType>(
FragmentMode.Single => new SequentialDownloader< OutType>(
copyOfContext,
ctx => (IUnitDownloader<OutType>)ConstructUnitDownloader(ctx),
context.DownloadLogger).WrapOrdered(),
+10 -10
View File
@@ -6,15 +6,15 @@ using Beam.Stealth;
namespace Beam.Fluent;
public interface IContextStage<RawType, OutType> {
IContextStage<RawType, OutType> Configure(Action<DownloadContextBuilder<RawType>> configure);
IContextStage<RawType, OutType> WithParallelism(int degree);
IContextStage<RawType, OutType> WithTimeout(TimeSpan timeout);
IContextStage<RawType, OutType> WithRetryReporter(IProgress<IRetryReport> reporter);
IContextStage<RawType, OutType> UseFragments();
IContextStage<RawType, OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator);
IContextStage<RawType, OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config);
IContextStage<RawType, OutType> ConfigureUnitDownloaderOptions(
Action<UnitDownloaderOptionsBuilder<RawType, OutType>> configure);
public interface IContextStage<OutType> {
IContextStage<OutType> Configure(Action<DownloadContextBuilder> configure);
IContextStage<OutType> WithParallelism(int degree);
IContextStage<OutType> WithTimeout(TimeSpan timeout);
IContextStage<OutType> WithRetryReporter(IProgress<IRetryReport> reporter);
IContextStage<OutType> UseFragments();
IContextStage<OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator);
IContextStage<OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config);
IContextStage<OutType> ConfigureUnitDownloaderOptions(
Action<UnitDownloaderOptionsBuilder<OutType>> configure);
DownloadEnumerable<OutType> Build();
}
+4 -4
View File
@@ -2,10 +2,10 @@
namespace Beam.Fluent;
public interface IDownloadStage<RawType, OutType> {
IDownloadStage<RawType, OutType> SaveToDirectory(string dir);
IDownloadStage<RawType, OutType> SaveToFiles(IEnumerable<string> files);
IDownloadStage<RawType, OutType> SaveToMemory(ConcurrentBag<OutType> bag);
public interface IDownloadStage<OutType> {
IDownloadStage<OutType> SaveToDirectory(string dir);
IDownloadStage<OutType> SaveToFiles(IEnumerable<string> files);
IDownloadStage<OutType> SaveToMemory(ConcurrentBag<OutType> bag);
void WaitForDownload();
Task WaitForDownloadAsync();
DownloadEnumerable<OutType> AsAsyncEnumerable();
+2 -2
View File
@@ -3,6 +3,6 @@ using Beam.Models;
namespace Beam.Fluent;
public interface ITransformStage<RawType, OutType> {
IContextStage<RawType, OutType> WithTransformer(AsyncTransformer<RawType, OutType> factory);
public interface ITransformStage<OutType> {
IContextStage<OutType> WithTransformer(AsyncTransformer<ByteDocument, OutType> factory);
}
+4 -4
View File
@@ -5,7 +5,7 @@ using Beam.Models;
namespace Beam.Fluent;
internal sealed class DownloadStage<RawType, OutType>(DownloadEnumerable<OutType> download) : IDownloadStage<RawType, OutType> where RawType : IDocument {
internal sealed class DownloadStage<OutType>(DownloadEnumerable<OutType> download) : IDownloadStage<OutType> {
private IAsyncEnumerable<Ordered<OutType>> _download = download;
public DownloadEnumerable<OutType> AsAsyncEnumerable() {
@@ -22,16 +22,16 @@ internal sealed class DownloadStage<RawType, OutType>(DownloadEnumerable<OutType
}
}
public IDownloadStage<RawType, OutType> SaveToDirectory(string dir) {
public IDownloadStage<OutType> SaveToDirectory(string dir) {
_download = _SaveToDirectory(dir);
return this;
}
public IDownloadStage<RawType, OutType> SaveToFiles(IEnumerable<string> files) {
public IDownloadStage<OutType> SaveToFiles(IEnumerable<string> files) {
throw new NotImplementedException();
}
public IDownloadStage<RawType, OutType> SaveToMemory(ConcurrentBag<OutType> bag) {
public IDownloadStage<OutType> SaveToMemory(ConcurrentBag<OutType> bag) {
throw new NotImplementedException();
}
+7 -7
View File
@@ -8,22 +8,22 @@ using Beam.Models;
namespace Beam.Fluent;
public static class FluentDownload {
public static ITransformStage<RawType, OutType> Links<RawType, OutType>(params IEnumerable<string> links) where RawType : IDocument {
return new TransformStage<RawType, OutType>(new DownloadContextBuilder<RawType>()
public static ITransformStage<OutType> Links<OutType>(params IEnumerable<string> links) {
return new TransformStage<OutType>(new DownloadContextBuilder()
.WithLinks(links));
}
public static ITransformStage<RawType, OutType>
ResourceDefinition<RawType, OutType>(ResourceDefinition definition) where RawType : IDocument {
public static ITransformStage< OutType>
ResourceDefinition< OutType>(ResourceDefinition definition) {
if (definition.Location.States.Count == 0)
throw new ArgumentException(Exceptions.Exceptions.resource_definition_invalid_states_count, nameof(definition));
var linkGenerator = new OrderedLinkGenerator(definition.Location.Segments, (NumberedStateChanger)definition.Location.StateChanger.Behavior,
definition.Location.States.First().Copy());
return new TransformStage<RawType, OutType>(new DownloadContextBuilder<RawType>()
return new TransformStage< OutType>(new DownloadContextBuilder()
.WithLinks(StringEnumerable.FromGenerator(linkGenerator!)));
}
public static ITransformStage<RawType, OutType> FromContext<RawType, OutType>(DownloadContext<RawType> existing) where RawType : IDocument {
return new TransformStage<RawType, OutType>(DownloadContextBuilder<RawType>.FromContext(existing));
public static ITransformStage< OutType> FromContext< OutType>(DownloadContext existing) {
return new TransformStage< OutType>(DownloadContextBuilder.FromContext(existing));
}
}
+3 -3
View File
@@ -6,8 +6,8 @@ using Beam.Models;
namespace Beam.Fluent;
internal sealed class TransformStage<RawType, OutType>(DownloadContextBuilder<RawType> CtxBuilder) : ITransformStage<RawType, OutType> where RawType : IDocument {
public IContextStage<RawType, OutType> WithTransformer(AsyncTransformer<RawType, OutType> transformer) {
return new ContextStage<RawType, OutType>(CtxBuilder, transformer);
internal sealed class TransformStage<OutType>(DownloadContextBuilder CtxBuilder) : ITransformStage<OutType> {
public IContextStage<OutType> WithTransformer(AsyncTransformer<ByteDocument, OutType> transformer) {
return new ContextStage<OutType>(CtxBuilder, transformer);
}
}
+28
View File
@@ -0,0 +1,28 @@
namespace Beam.Models;
public enum DownloadTarget {
/// <summary>
/// Specifies the target as the object directly returned through accessing the URL (whole page).
/// </summary>
/// <remarks>
/// Default to this mode where possible.
/// </remarks>
URL,
/// <summary>
/// Specifies the target as an object accessible only through the url (element in page).
/// </summary>
/// <remarks>
/// Only use this mode if what is needed
/// cannot be acquired by using <see cref="DownloadTarget.URL"/>
/// </remarks>
InURL,
/// <summary>
/// Specifies the target as an object that may be retrieved through a user-defined operation on the url
/// (e.g. javascript triggered downloads).
/// </summary>
/// <remarks>
/// Only use this mode if what is needed cannot be acquired by either
/// <see cref="URL"/> or <see cref="InURL"/>
/// </remarks>
Complex
}
+5
View File
@@ -0,0 +1,5 @@
using System.Diagnostics.CodeAnalysis;
namespace Beam.Models;
public delegate bool SkipPredicate<T>(string link, [NotNullWhen(true)] out T defaultValue);
+16 -21
View File
@@ -1,36 +1,31 @@
using Beam.Abstractions;
using Beam.Downloaders;
using Beam.Models;
using Beam.Playwright.Strategies;
using Microsoft.Playwright;
namespace Beam.Playwright {
public class PlaywrightUnitDownloader<RawType, OutType>(
UnitDownloaderOptions<RawType, OutType> options,
PlaywrightAsyncManipulator puppetManipulator)
: UnitDownloader<RawType, OutType>(options)
where RawType : IDocument {
public PlaywrightAsyncManipulator PuppetManipulator { get; } = puppetManipulator;
public class PlaywrightUnitDownloader<OutType> : UnitDownloader<OutType> {
public PlaywrightUnitDownloader(UnitDownloaderOptions<OutType> options,
PlaywrightAsyncManipulator puppetManipulator) : base(options) {
PuppetManipulator = puppetManipulator;
_downloadStrategy = options.Target switch {
DownloadTarget.URL or DownloadTarget.InURL => new PageDownloadStrategy(),
DownloadTarget.Complex => new WaitingDownloadStrategy(),
_ => throw new NotSupportedException() // TODO add an exception message
};
}
public PlaywrightAsyncManipulator PuppetManipulator { get; }
private IDownloadStrategy _downloadStrategy { get; }
protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress<IDownloadReport> progress, CancellationToken ct) {
var page = await PlaywrightContext.Browser.Value.NewPageAsync();
try {
await page.GotoAsync(url);
await PuppetManipulator(page);
var download = await page.WaitForDownloadAsync();
await using var stream = await download.CreateReadStreamAsync();
var buffer = new byte[bufferSize];
var inBuffer = 0;
var downloaded = 0;
while ((inBuffer = stream.Read(buffer)) > 0) {
downloaded += inBuffer;
progress?.Report(new DownloadReport() {
BytesDownloaded = downloaded,
BytesRemaining = stream.Length - downloaded
});
await destinationStream.WriteAsync(buffer.AsMemory(0, inBuffer), ct);
}
await _downloadStrategy.DownloadToStream(url, bufferSize, destinationStream, progress, page, ct);
} finally {
if (!page.IsClosed)
await page.CloseAsync();
@@ -0,0 +1,9 @@
using Beam.Abstractions;
using Microsoft.Playwright;
namespace Beam.Playwright.Strategies;
internal interface IDownloadStrategy {
Task DownloadToStream(string url, int bufferSize, Stream destinationStream,
IProgress<IDownloadReport> progress, IPage page, CancellationToken ct);
}
@@ -0,0 +1,15 @@
using System.Text;
using Beam.Abstractions;
using Microsoft.Playwright;
namespace Beam.Playwright.Strategies;
internal class PageDownloadStrategy : IDownloadStrategy {
public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress<IDownloadReport> progress, IPage page,
CancellationToken ct) {
var source = await page.InnerHTMLAsync("html", new PageInnerHTMLOptions() { Strict = false });
var bytes = Encoding.UTF8.GetBytes(source);
await destinationStream.WriteAsync(bytes, ct);
}
}
@@ -0,0 +1,25 @@
using Beam.Abstractions;
using Beam.Models;
using Microsoft.Playwright;
namespace Beam.Playwright.Strategies;
internal class WaitingDownloadStrategy : IDownloadStrategy {
public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress<IDownloadReport> progress, IPage page,
CancellationToken ct) {
var download = await page.WaitForDownloadAsync();
await using var stream = await download.CreateReadStreamAsync();
var buffer = new byte[bufferSize];
var inBuffer = 0;
var downloaded = 0;
while ((inBuffer = stream.Read(buffer)) > 0) {
downloaded += inBuffer;
progress?.Report(new DownloadReport() {
BytesDownloaded = downloaded,
BytesRemaining = stream.Length - downloaded
});
await destinationStream.WriteAsync(buffer.AsMemory(0, inBuffer), ct);
}
}
}
+2 -2
View File
@@ -9,7 +9,7 @@ using Beam.Downloaders;
using Beam.Models;
namespace Beam.Stealth {
public class StealthFragmentDownloader<RawType, OutType> : UnitFragmentDownloader<RawType, OutType> where RawType : IDocument {
public StealthFragmentDownloader(UnitDownloaderOptions<RawType, OutType> options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitDownloader<RawType, OutType>(options, config, manipulator)) {}
public class StealthFragmentDownloader<OutType> : UnitFragmentDownloader<OutType> {
public StealthFragmentDownloader(UnitDownloaderOptions<OutType> options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitDownloader<OutType>(options, config, manipulator)) {}
}
}
@@ -1,16 +0,0 @@
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Beam.Abstractions;
using Beam.Downloaders;
using Beam.Models;
namespace Beam.Stealth {
public class StealthFragmentPageDownloader<RawType, OutType> : UnitFragmentDownloader<RawType, OutType> where RawType : IDocument {
public StealthFragmentPageDownloader(UnitDownloaderOptions<RawType, OutType> options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitPageDownloader<RawType, OutType>(options, config, manipulator)) {}
}
}
+12 -72
View File
@@ -9,18 +9,27 @@ using System.Threading.Tasks;
using Beam.Abstractions;
using Beam.Downloaders;
using Beam.Models;
using Beam.Stealth.Strategies;
namespace Beam.Stealth {
using File = System.IO.File;
public class StealthUnitDownloader<RawType, OutType> : UnitDownloader<RawType, OutType> where RawType : IDocument {
public class StealthUnitDownloader<OutType> : UnitDownloader<OutType> {
public StealthConfig Config { get; }
public StealthAsyncManipulator Manipulator { get; }
private ILogger? Logger => Config.Logger;
public StealthUnitDownloader(UnitDownloaderOptions<RawType, OutType> options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) {
private IDownloadStrategy _downloadStrategy { get; }
public StealthUnitDownloader(UnitDownloaderOptions<OutType> options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) {
Config = config;
Manipulator = manipulator;
_downloadStrategy = options.Target switch {
DownloadTarget.URL or DownloadTarget.InURL => new PageDownloadStrategy(),
DownloadTarget.Complex => new WaitingDownloadStrategy(),
_ => throw new NotSupportedException() // TODO add an exception message
};
}
protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream,
@@ -29,76 +38,7 @@ namespace Beam.Stealth {
await driver.Navigate().GoToUrlAsync(url);
await Manipulator(driver);
await using var stream = await WaitForDownloadAsync(url, progress, Stopwatch.StartNew(), ct);
await (stream?.CopyToAsync(destinationStream, ct) ?? Task.CompletedTask);
}
/* --------------------------------------------------------------------- */
private async Task<Stream?> WaitForDownloadAsync(
string link, IProgress<IDownloadReport> progress, Stopwatch sw, CancellationToken ct) {
const int PollDelayMs = 250; // how often we look
const int StableDelayMs = 1000; // size-unchanged window
string dir = Config.DownloadsDirectory;
string? finalPath = null;
long lastSize = -1;
DateTime lastChange = DateTime.UtcNow;
bool IsTemp(string p) =>
p.EndsWith(".crdownload", StringComparison.OrdinalIgnoreCase) ||
p.EndsWith(".part", StringComparison.OrdinalIgnoreCase);
Logger?.LogDebug("Polling {Dir} for download files", dir);
while (sw.Elapsed < Config.TimeOut && !ct.IsCancellationRequested) {
// current files in the directory
var files = Directory.EnumerateFiles(dir, "*", SearchOption.TopDirectoryOnly).ToArray();
// ignore temp names; pick (or re-pick) the first real candidate
finalPath ??= files.FirstOrDefault(f => !IsTemp(f));
// still nothing but temps keep waiting
if (finalPath is null) {
await Task.Delay(PollDelayMs, ct);
continue;
}
// track growth
long size = new FileInfo(finalPath).Length;
if (size == 0 || size != lastSize) {
progress?.Report(new DownloadReport() {
BytesDownloaded = size - lastSize,
});
lastSize = size;
lastChange = DateTime.UtcNow;
await Task.Delay(PollDelayMs, ct);
continue;
}
// size stable long enough *and* no temp files left?
bool tempsRemain = files.Any(IsTemp);
if ((DateTime.UtcNow - lastChange).TotalMilliseconds < StableDelayMs || tempsRemain) {
await Task.Delay(PollDelayMs, ct);
continue;
}
// wait until writer releases lock
while (true) {
try {
using FileStream _ =
File.Open(finalPath, FileMode.Open, FileAccess.Read, FileShare.None);
break;
} catch (IOException) {
await Task.Delay(200, ct);
}
}
return File.OpenRead(finalPath);
}
Logger?.LogWarning("Download timed out after {Elapsed}", sw.Elapsed);
return null;
await _downloadStrategy.DownloadToStream(url, bufferSize, destinationStream, progress, Config, Logger, ct);
}
-33
View File
@@ -1,33 +0,0 @@
using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Beam.Abstractions;
using Beam.Downloaders;
using Beam.Models;
namespace Beam.Stealth {
public class StealthUnitPageDownloader<RawType, OutType> : UnitDownloader<RawType, OutType> where RawType : IDocument {
public StealthConfig Config { get; }
public StealthAsyncManipulator Manipulator { get; }
private ILogger? Logger => Config.Logger;
public StealthUnitPageDownloader(UnitDownloaderOptions<RawType, OutType> options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) {
Config = config;
Manipulator = manipulator;
}
protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress<IDownloadReport> progress, CancellationToken ct) {
var driver = Config.Driver;
await driver.Navigate().GoToUrlAsync(url);
await Manipulator(driver);
byte[] bytes = Encoding.UTF8.GetBytes(driver.PageSource);
await destinationStream.WriteAsync(bytes, ct);
}
}
}
@@ -0,0 +1,9 @@
using Beam.Abstractions;
using Microsoft.Extensions.Logging;
namespace Beam.Stealth.Strategies;
internal interface IDownloadStrategy {
Task DownloadToStream(string url, int bufferSize, Stream destinationStream,
IProgress<IDownloadReport> progress, StealthConfig config, ILogger? logger, CancellationToken ct);
}
@@ -0,0 +1,13 @@
using System.Text;
using Beam.Abstractions;
using Microsoft.Extensions.Logging;
namespace Beam.Stealth.Strategies;
internal class PageDownloadStrategy : IDownloadStrategy {
public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress<IDownloadReport> progress, StealthConfig config,
ILogger? logger, CancellationToken ct) {
byte[] bytes = Encoding.UTF8.GetBytes(config.Driver.PageSource);
await destinationStream.WriteAsync(bytes, ct);
}
}
@@ -0,0 +1,83 @@
using System.Diagnostics;
using Beam.Abstractions;
using Beam.Models;
using Microsoft.Extensions.Logging;
using File = System.IO.File;
namespace Beam.Stealth.Strategies;
public class WaitingDownloadStrategy : IDownloadStrategy {
public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress<IDownloadReport> progress, StealthConfig config,
ILogger? logger, CancellationToken ct) {
await using var stream = await WaitForDownloadAsync(url, progress, Stopwatch.StartNew(), config, logger, ct);
await (stream?.CopyToAsync(destinationStream, ct) ?? Task.CompletedTask);
}
private async Task<Stream?> WaitForDownloadAsync(
string link, IProgress<IDownloadReport> progress, Stopwatch sw, StealthConfig config, ILogger? logger, CancellationToken ct) {
const int PollDelayMs = 250; // how often we look
const int StableDelayMs = 1000; // size-unchanged window
string dir = config.DownloadsDirectory;
string? finalPath = null;
long lastSize = -1;
DateTime lastChange = DateTime.UtcNow;
bool IsTemp(string p) =>
p.EndsWith(".crdownload", StringComparison.OrdinalIgnoreCase) ||
p.EndsWith(".part", StringComparison.OrdinalIgnoreCase);
logger?.LogDebug("Polling {Dir} for download files", dir);
while (sw.Elapsed < config.TimeOut && !ct.IsCancellationRequested) {
// current files in the directory
var files = Directory.EnumerateFiles(dir, "*", SearchOption.TopDirectoryOnly).ToArray();
// ignore temp names; pick (or re-pick) the first real candidate
finalPath ??= files.FirstOrDefault(f => !IsTemp(f));
// still nothing but temps keep waiting
if (finalPath is null) {
await Task.Delay(PollDelayMs, ct);
continue;
}
// track growth
long size = new FileInfo(finalPath).Length;
if (size == 0 || size != lastSize) {
progress?.Report(new DownloadReport() {
BytesDownloaded = size - lastSize,
});
lastSize = size;
lastChange = DateTime.UtcNow;
await Task.Delay(PollDelayMs, ct);
continue;
}
// size stable long enough *and* no temp files left?
bool tempsRemain = files.Any(IsTemp);
if ((DateTime.UtcNow - lastChange).TotalMilliseconds < StableDelayMs || tempsRemain) {
await Task.Delay(PollDelayMs, ct);
continue;
}
// wait until writer releases lock
while (true) {
try {
using FileStream _ =
File.Open(finalPath, FileMode.Open, FileAccess.Read, FileShare.None);
break;
} catch (IOException) {
await Task.Delay(200, ct);
}
}
return File.OpenRead(finalPath);
}
logger?.LogWarning("Download timed out after {Elapsed}", sw.Elapsed);
return null;
}
}