diff --git a/Beam.Downloaders/Beam.Downloaders.csproj b/Beam.Downloaders/Beam.Downloaders.csproj index ab69f30..1526f93 100644 --- a/Beam.Downloaders/Beam.Downloaders.csproj +++ b/Beam.Downloaders/Beam.Downloaders.csproj @@ -14,6 +14,7 @@ + diff --git a/Beam.Downloaders/DownloadContext.cs b/Beam.Downloaders/DownloadContext.cs index 96009af..59e0ab1 100644 --- a/Beam.Downloaders/DownloadContext.cs +++ b/Beam.Downloaders/DownloadContext.cs @@ -8,14 +8,14 @@ namespace Beam.Downloaders { //public delegate Task AsyncHtmlTransformer(HtmlDocument doc); //public delegate Task AsyncBinaryTransformer(byte[] bin); - public class DownloadContext { + public class DownloadContext { private bool disposedValue; public HttpClient Client { get; } public HtmlWeb Web { get; } public IProgress? DownloadReporter { get; set; } public IProgress? RetryReporter { get; set; } - public AsyncDownloadFailurePredicate?[]? AsyncFailurePredicates { get; } + public AsyncDownloadFailurePredicate?[]? AsyncFailurePredicates { get; } public TimeSpan TimeOut { get; set; } public IEnumerable Links { get; } public CancellationToken CancellationToken { get; } @@ -28,7 +28,7 @@ namespace Beam.Downloaders { CancellationToken cancellationToken = default, IProgress? downloadReporter = null, IProgress? retryReporter = null, - AsyncDownloadFailurePredicate?[]? asyncFailurePredicates = null, + AsyncDownloadFailurePredicate?[]? asyncFailurePredicates = null, TimeSpan? timeOut = null, ILogger? downloadLogger = null) { ArgumentNullException.ThrowIfNull(web, nameof(web)); diff --git a/Beam.Downloaders/DownloadContextBuilder.cs b/Beam.Downloaders/DownloadContextBuilder.cs index d92277a..41f5e13 100644 --- a/Beam.Downloaders/DownloadContextBuilder.cs +++ b/Beam.Downloaders/DownloadContextBuilder.cs @@ -5,12 +5,12 @@ using Microsoft.Extensions.Logging; namespace Beam.Downloaders { - public class DownloadContextBuilder { + public class DownloadContextBuilder { private HtmlWeb _web; private HttpClient _client; private IProgress? _downloadReporter; private IProgress? _retryReporter; - private AsyncDownloadFailurePredicate?[] _asyncFailurePredicates = []; + private AsyncDownloadFailurePredicate?[] _asyncFailurePredicates = []; private TimeSpan _timeOut; private IEnumerable _links; private CancellationToken _cancellationToken; @@ -26,60 +26,60 @@ namespace Beam.Downloaders { _links = []; } - public DownloadContextBuilder WithWeb(HtmlWeb web) { + public DownloadContextBuilder WithWeb(HtmlWeb web) { _web = web; return this; } - public DownloadContextBuilder WithClient(HttpClient client) { + public DownloadContextBuilder WithClient(HttpClient client) { _client = client; return this; } - public DownloadContextBuilder WithDownloadReporter(IProgress downloadReporter) { + public DownloadContextBuilder WithDownloadReporter(IProgress downloadReporter) { _downloadReporter = downloadReporter; return this; } - public DownloadContextBuilder WithRetryReporter(IProgress retryReporter) { + public DownloadContextBuilder WithRetryReporter(IProgress retryReporter) { _retryReporter = retryReporter; return this; } - public DownloadContextBuilder WithAsyncFailurePredicates(params AsyncDownloadFailurePredicate[] predicates) { + public DownloadContextBuilder WithAsyncFailurePredicates(params AsyncDownloadFailurePredicate[] predicates) { _asyncFailurePredicates = predicates; return this; } - public DownloadContextBuilder WithTimeOut(TimeSpan timeOut) { + public DownloadContextBuilder WithTimeOut(TimeSpan timeOut) { _timeOut = timeOut; return this; } - public DownloadContextBuilder WithLinks(IEnumerable links) { + public DownloadContextBuilder WithLinks(IEnumerable links) { _links = links; return this; } - public DownloadContextBuilder WithCancellationToken(CancellationToken cancellationToken) { + public DownloadContextBuilder WithCancellationToken(CancellationToken cancellationToken) { _cancellationToken = cancellationToken; return this; } - public DownloadContextBuilder WithCache(DocumentCache cache) { + public DownloadContextBuilder WithCache(DocumentCache cache) { _cache = cache; return this; } - public DownloadContextBuilder WithDownloadLogger(ILogger downloadLogger) { + public DownloadContextBuilder WithDownloadLogger(ILogger downloadLogger) { _downloadLogger = downloadLogger; return this; } - public DownloadContext Build() { + public DownloadContext Build() { // Construct the DownloadContext using the collected values. - var context = new DownloadContext( + var context = new DownloadContext( web: _web, client: _client, links: _links, @@ -100,15 +100,15 @@ namespace Beam.Downloaders { return context; } - public static DownloadContextBuilder FromContext(DownloadContext existing) { + public static DownloadContextBuilder FromContext(DownloadContext existing) { if (existing == null) throw new ArgumentNullException(nameof(existing)); - return new DownloadContextBuilder(existing.Client, existing.Web) + return new DownloadContextBuilder(existing.Client, existing.Web) .WithLinks(existing.Links) .WithCancellationToken(existing.CancellationToken) .WithDownloadReporter(existing.DownloadReporter!) .WithRetryReporter(existing.RetryReporter!) - .WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty>()) + .WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty>()) .WithTimeOut(existing.TimeOut) .WithDownloadLogger(existing.DownloadLogger!) .WithCache(existing.Cache); diff --git a/Beam.Downloaders/FailurePredicateOptions.cs b/Beam.Downloaders/FailurePredicateOptions.cs new file mode 100644 index 0000000..e804577 --- /dev/null +++ b/Beam.Downloaders/FailurePredicateOptions.cs @@ -0,0 +1,9 @@ +using Beam.Models; + +namespace Beam.Downloaders; + +public record class FailurePredicateOptions { + public required AsyncDownloadFailurePredicate?[]? AsyncDownloadFailurePredicates { get; init; } + public bool ProcessInParallel { get; init; } = false; + public int? ParallelThreads { get; init; } +} \ No newline at end of file diff --git a/Beam.Downloaders/FailurePredicateOptionsBuilder.cs b/Beam.Downloaders/FailurePredicateOptionsBuilder.cs new file mode 100644 index 0000000..d5ec367 --- /dev/null +++ b/Beam.Downloaders/FailurePredicateOptionsBuilder.cs @@ -0,0 +1,56 @@ +using Beam.Models; + +namespace Beam.Downloaders; + +public sealed class FailurePredicateOptionsBuilder +{ + private readonly System.Collections.Generic.List?> _predicates = + new System.Collections.Generic.List?>(); + private bool _processInParallel = false; + private int? _parallelThreads = null; + + public FailurePredicateOptionsBuilder WithPredicate(AsyncDownloadFailurePredicate? predicate) + { + _predicates.Add(predicate); + return this; + } + + public FailurePredicateOptionsBuilder WithPredicates(System.Collections.Generic.IEnumerable?> predicates) + { + if (predicates == null) throw new System.ArgumentNullException(nameof(predicates)); + _predicates.AddRange(predicates); + return this; + } + + public FailurePredicateOptionsBuilder WithPredicates(params AsyncDownloadFailurePredicate?[] predicates) + { + _predicates.Clear(); + if (predicates != null) _predicates.AddRange(predicates); + return this; + } + + public FailurePredicateOptionsBuilder WithProcessInParallel(bool value = true) + { + _processInParallel = value; + return this; + } + + public FailurePredicateOptionsBuilder WithParallelThreads(int? threads) + { + if (threads.HasValue && threads.Value <= 0) + throw new System.ArgumentOutOfRangeException(nameof(threads)); + _parallelThreads = threads; + return this; + } + + public FailurePredicateOptions Build() + { + var arr = _predicates.Count == 0 ? [] : _predicates.ToArray(); + return new FailurePredicateOptions + { + AsyncDownloadFailurePredicates = arr, + ProcessInParallel = _processInParallel, + ParallelThreads = _parallelThreads + }; + } +} \ No newline at end of file diff --git a/Beam.Downloaders/FragmentOptions.cs b/Beam.Downloaders/FragmentOptions.cs new file mode 100644 index 0000000..a00c698 --- /dev/null +++ b/Beam.Downloaders/FragmentOptions.cs @@ -0,0 +1,7 @@ +namespace Beam.Downloaders; + +public record class FragmentOptions { + public required int FragmentSize { get; init; } + public bool DownloadInParallel { get; init; } = false; + public int? ParallelThreads { get; init; } +} \ No newline at end of file diff --git a/Beam.Downloaders/FragmentOptionsBuilder.cs b/Beam.Downloaders/FragmentOptionsBuilder.cs new file mode 100644 index 0000000..49ea521 --- /dev/null +++ b/Beam.Downloaders/FragmentOptionsBuilder.cs @@ -0,0 +1,36 @@ +namespace Beam.Downloaders; + +public sealed class FragmentOptionsBuilder { + private int? _fragmentSize; + private bool _downloadInParallel = false; + private int? _parallelThreads = null; + + public FragmentOptionsBuilder WithFragmentSize(int bytes) { + if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); + _fragmentSize = bytes; + return this; + } + + public FragmentOptionsBuilder WithDownloadInParallel(bool value = true) { + _downloadInParallel = value; + return this; + } + + public FragmentOptionsBuilder WithParallelThreads(int? threads) { + if (threads.HasValue && threads.Value <= 0) + throw new System.ArgumentOutOfRangeException(nameof(threads)); + _parallelThreads = threads; + return this; + } + + public FragmentOptions Build() { + if (!_fragmentSize.HasValue) + throw new System.InvalidOperationException("FragmentSize must be provided."); + + return new FragmentOptions { + FragmentSize = _fragmentSize.Value, + DownloadInParallel = _downloadInParallel, + ParallelThreads = _parallelThreads + }; + } +} \ No newline at end of file diff --git a/Beam.Downloaders/SequentialDownloader.cs b/Beam.Downloaders/SequentialDownloader.cs index 378000b..2f7db22 100644 --- a/Beam.Downloaders/SequentialDownloader.cs +++ b/Beam.Downloaders/SequentialDownloader.cs @@ -3,9 +3,9 @@ using Beam.Models; using Microsoft.Extensions.Logging; namespace Beam.Downloaders { - public class SequentialDownloader : IAsyncEnumerator { + public class SequentialDownloader : IAsyncEnumerator { public OutType Current { get; protected set; } - public DownloadContext Context { get; } + public DownloadContext Context { get; } public ILogger? Logger { get; set; } public int LastOrder { get; set; } = 0; @@ -13,7 +13,7 @@ namespace Beam.Downloaders { public Func> GetUnitDownloader { get; set; } - public SequentialDownloader(DownloadContext context, Func, IUnitDownloader> getUnitDownloader, ILogger? logger = null) { + public SequentialDownloader(DownloadContext context, Func> getUnitDownloader, ILogger? logger = null) { Context = context; Logger = logger; LinksEnumerator = Context.Links.GetEnumerator(); diff --git a/Beam.Downloaders/SequentialFragmentDownloader.cs b/Beam.Downloaders/SequentialFragmentDownloader.cs index b4c40d5..a7a1274 100644 --- a/Beam.Downloaders/SequentialFragmentDownloader.cs +++ b/Beam.Downloaders/SequentialFragmentDownloader.cs @@ -3,10 +3,10 @@ using Beam.Models; using Microsoft.Extensions.Logging; namespace Beam.Downloaders { - public class SequentialFragmentDownloader : SequentialDownloader>> { + public class SequentialFragmentDownloader : SequentialDownloader>> { public SequentialFragmentDownloader( - DownloadContext context, - Func, IUnitDownloader>>> getUnitDownloader, + DownloadContext context, + Func>>> getUnitDownloader, ILogger? logger = null) : base(context, getUnitDownloader, logger) {} } diff --git a/Beam.Downloaders/SkipPredicateOptions.cs b/Beam.Downloaders/SkipPredicateOptions.cs new file mode 100644 index 0000000..2955cf7 --- /dev/null +++ b/Beam.Downloaders/SkipPredicateOptions.cs @@ -0,0 +1,48 @@ +using Beam.Models; + +namespace Beam.Downloaders; + +public class SkipPredicateOptions { + public required SkipPredicate?[]? SkipPredicates { get; init; } + public bool ProcessInParallel { get; init; } = false; + public int? ParallelThreads { get; init; } +} + +public class SkipPredicateOptionsBuilder { + private List?> _skipPredicates { get; set; } = []; + private bool _processInParallel { get; set; } = false; + private int? _parallelThreads { get; set; } + + public SkipPredicateOptionsBuilder WithSkipPredicate(SkipPredicate predicate, bool replace=false) { + if (replace) + _skipPredicates.Clear(); + _skipPredicates.Add(predicate); + return this; + } + + public SkipPredicateOptionsBuilder WithSkipPredicates(SkipPredicate[] predicates, + bool replace = true) { + if (replace) + _skipPredicates.Clear(); + _skipPredicates.AddRange(predicates); + return this; + } + + public SkipPredicateOptionsBuilder ProcessInParallel(bool processInParallel = true) { + _processInParallel = processInParallel; + return this; + } + + public SkipPredicateOptionsBuilder WithParallelThreads(int parallelThreads) { + _parallelThreads = parallelThreads; + return this; + } + + public SkipPredicateOptions Build() { + return new SkipPredicateOptions() { + SkipPredicates = _skipPredicates.ToArray(), + ParallelThreads = _parallelThreads, + ProcessInParallel = _processInParallel + }; + } +} \ No newline at end of file diff --git a/Beam.Downloaders/UnitDownloader.cs b/Beam.Downloaders/UnitDownloader.cs index 400f47a..cad1ccc 100644 --- a/Beam.Downloaders/UnitDownloader.cs +++ b/Beam.Downloaders/UnitDownloader.cs @@ -1,4 +1,6 @@ -using Beam.Abstractions; +using System.Diagnostics.CodeAnalysis; +using System.Text; +using Beam.Abstractions; using Beam.Models; using HtmlAgilityPack; using File = System.IO.File; @@ -11,12 +13,12 @@ namespace Beam.Downloaders { /// /// /// - public class UnitDownloader(UnitDownloaderOptions options) : IUnitDownloader where RawType : IDocument { - public UnitDownloaderOptions Options { get; } = options; + public class UnitDownloader(UnitDownloaderOptions options) : IUnitDownloader { + public UnitDownloaderOptions Options { get; } = options; public HttpClient Client => Options.Client; - public virtual AsyncTransformer Transformer => Options.AsyncTransformer; - - public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates => + public virtual AsyncTransformer Transformer => Options.AsyncTransformer; + + public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates => Options?.FailurePredicateOptions?.AsyncDownloadFailurePredicates; public int LinksPerDownload { get; } = 1; @@ -70,7 +72,7 @@ namespace Beam.Downloaders { return new ByteDocument(url, bytes); } - protected virtual async Task IsFailure(RawType doc, CancellationToken ct) { + protected virtual async Task IsFailure(ByteDocument doc, CancellationToken ct) { if (FailurePredicates is null) return false; if (!(Options?.FailurePredicateOptions?.ProcessInParallel ?? false)) @@ -103,19 +105,18 @@ namespace Beam.Downloaders { return false; } - protected virtual async Task _Download(string link, IProgress progress, CancellationToken ct) { - if (Options.DownloadFolder is not null && this is UnitDownloader) { - var path = Path.Combine(Options.DownloadFolder, Path.GetRandomFileName()); + protected virtual async Task _Download(string link, IProgress progress, CancellationToken ct) { + if (Options.DownloadFolder is not null) { + var path = Path.Combine(Options.DownloadFolder, options.GetFileNameForDownload(link, [])); await DownloadToFile(link, Options.BufferSize, path, progress, ct); - return (RawType)(object)new StringDocument(link, path); + return new ByteDocument(link, Encoding.UTF8.GetBytes(path)); } - if (this is UnitDownloader) { - return (RawType)(object)(await DownloadToMemory(link, Options.BufferSize, progress, ct)); + else { + return await DownloadToMemory(link, Options.BufferSize, progress, ct); } - throw new NotSupportedException(Exceptions.Exceptions.unit_downloader_limited_support); } - protected virtual async Task<(bool, OutType?)> Transform(RawType download, CancellationToken ct) { + protected virtual async Task<(bool, OutType?)> Transform(ByteDocument download, CancellationToken ct) { try { if (FailurePredicates is null || !(await IsFailure(download, ct))) return (true, await Transformer(download)); @@ -131,6 +132,9 @@ namespace Beam.Downloaders { return (false, default); downProgress ??= new Progress(); + + if (ShouldSkip(link[0].Data, out var defaultType)) + return (true, defaultType); OutType? ot = default; int tryCount = 0; @@ -147,5 +151,41 @@ namespace Beam.Downloaders { return (false, ot); } + + private bool ShouldSkip(string link, [NotNullWhen(true)] out OutType? outType) { + outType = default; + if (Options.SkipPredicateOptions?.SkipPredicates is null) + return false; + if (!Options.SkipPredicateOptions.ProcessInParallel) + foreach (var pred in Options.SkipPredicateOptions.SkipPredicates) { + if (pred is null) + continue; + if (pred(link, out outType)) + return true; + } + else { + var shouldSkip = false; + OutType? _outType = default; + Parallel.ForEach(Options.SkipPredicateOptions.SkipPredicates, new ParallelOptions() { + MaxDegreeOfParallelism = Options?.FailurePredicateOptions?.ParallelThreads ?? 4 + }, + (predicate, parallelLoopState) => { + if (parallelLoopState.ShouldExitCurrentIteration) + return; + if (predicate == null) + return; + if (predicate(link, out var _innerLoopOutType)) { + Interlocked.CompareExchange(ref shouldSkip, true, false); + Interlocked.CompareExchange(ref _outType, _innerLoopOutType, default); + parallelLoopState.Break(); + } + } + ); + outType = _outType; + return shouldSkip; + } + + return false; + } } } diff --git a/Beam.Downloaders/UnitDownloaderOptions.cs b/Beam.Downloaders/UnitDownloaderOptions.cs index dc99555..421c3b6 100644 --- a/Beam.Downloaders/UnitDownloaderOptions.cs +++ b/Beam.Downloaders/UnitDownloaderOptions.cs @@ -1,198 +1,38 @@ +using System.Diagnostics.CodeAnalysis; +using System.Security.Cryptography; +using System.Text; using Beam.Models; namespace Beam.Downloaders; -public record class UnitDownloaderOptions { +public record class UnitDownloaderOptions { public HttpClient Client { get; init; } = new(); + + public DownloadTarget Target { get; init; } = DownloadTarget.URL; - public FailurePredicateOptions? FailurePredicateOptions { get; init; } + public SkipPredicateOptions? SkipPredicateOptions { get; init; } + public FailurePredicateOptions? FailurePredicateOptions { get; init; } public FragmentOptions? FragmentOptions { get; init; } - public required AsyncTransformer AsyncTransformer { get; init; } + public required AsyncTransformer AsyncTransformer { get; init; } + + /// + /// The location where the download is stored. + /// + /// + /// If not defined, UnitDownloader.TryDownload() downloads to memory. + /// public string? DownloadFolder { get; init; } = null; public int BufferSize { get; init; } = 80 * 1024; // 80kb + + public string GetFileNameForDownload(string url, byte[] additionalData) { + byte[] bytes = [..Encoding.UTF8.GetBytes(url), ..additionalData]; + var name = Convert.ToBase64String(System.IO.Hashing.XxHash64.Hash(bytes)); + return name.Replace('+', '-').Replace('/', '_').Replace('=', ' ').Trim(); + } } -public record class FailurePredicateOptions { - public required AsyncDownloadFailurePredicate?[]? AsyncDownloadFailurePredicates { get; init; } - public bool ProcessInParallel { get; init; } = false; - public int? ParallelThreads { get; init; } -} +// ---------- UnitDownloaderOptions Builder ---------- -public record class FragmentOptions { - public required int FragmentSize { get; init; } - public bool DownloadInParallel { get; init; } = false; - public int? ParallelThreads { get; init; } -} +// ---------- FailurePredicateOptions Builder ---------- - - // ---------- UnitDownloaderOptions Builder ---------- - public sealed class UnitDownloaderOptionsBuilder - { - private HttpClient _client = new HttpClient(); - private FailurePredicateOptions? _failureOptions; - private FragmentOptions? _fragmentOptions; - private AsyncTransformer? _asyncTransformer; - private string? _downloadFolder = null; - private int _bufferSize = 80 * 1024; - - public UnitDownloaderOptionsBuilder WithClient(HttpClient client) - { - _client = client ?? throw new System.ArgumentNullException(nameof(client)); - return this; - } - - public UnitDownloaderOptionsBuilder WithFailurePredicateOptions(FailurePredicateOptions? options) - { - _failureOptions = options; - return this; - } - - public UnitDownloaderOptionsBuilder WithFailurePredicates(System.Action> configure) - { - if (configure == null) throw new System.ArgumentNullException(nameof(configure)); - var b = new FailurePredicateOptionsBuilder(); - configure(b); - _failureOptions = b.Build(); - return this; - } - - public UnitDownloaderOptionsBuilder WithFragmentOptions(FragmentOptions? options) - { - _fragmentOptions = options; - return this; - } - - public UnitDownloaderOptionsBuilder WithFragments(System.Action configure) - { - if (configure == null) throw new System.ArgumentNullException(nameof(configure)); - var b = new FragmentOptionsBuilder(); - configure(b); - _fragmentOptions = b.Build(); - return this; - } - - public UnitDownloaderOptionsBuilder WithAsyncTransformer(AsyncTransformer transformer) - { - _asyncTransformer = transformer ?? throw new System.ArgumentNullException(nameof(transformer)); - return this; - } - - public UnitDownloaderOptionsBuilder WithDownloadFolder(string? downloadFolder) - { - _downloadFolder = downloadFolder; - return this; - } - - public UnitDownloaderOptionsBuilder WithBufferSize(int bytes) - { - if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); - _bufferSize = bytes; - return this; - } - - public UnitDownloaderOptions Build() - { - if (_asyncTransformer == null) - throw new System.InvalidOperationException("AsyncTransformer must be provided."); - - return new UnitDownloaderOptions - { - Client = _client, - FailurePredicateOptions = _failureOptions, - FragmentOptions = _fragmentOptions, - AsyncTransformer = _asyncTransformer, - DownloadFolder = _downloadFolder, - BufferSize = _bufferSize - }; - } - } - - // ---------- FailurePredicateOptions Builder ---------- - public sealed class FailurePredicateOptionsBuilder - { - private readonly System.Collections.Generic.List?> _predicates = - new System.Collections.Generic.List?>(); - private bool _processInParallel = false; - private int? _parallelThreads = null; - - public FailurePredicateOptionsBuilder WithPredicate(AsyncDownloadFailurePredicate? predicate) - { - _predicates.Add(predicate); - return this; - } - - public FailurePredicateOptionsBuilder WithPredicates(System.Collections.Generic.IEnumerable?> predicates) - { - if (predicates == null) throw new System.ArgumentNullException(nameof(predicates)); - _predicates.AddRange(predicates); - return this; - } - - public FailurePredicateOptionsBuilder WithPredicates(params AsyncDownloadFailurePredicate?[] predicates) - { - _predicates.Clear(); - if (predicates != null) _predicates.AddRange(predicates); - return this; - } - - public FailurePredicateOptionsBuilder WithProcessInParallel(bool value = true) - { - _processInParallel = value; - return this; - } - - public FailurePredicateOptionsBuilder WithParallelThreads(int? threads) - { - if (threads.HasValue && threads.Value <= 0) - throw new System.ArgumentOutOfRangeException(nameof(threads)); - _parallelThreads = threads; - return this; - } - - public FailurePredicateOptions Build() - { - var arr = _predicates.Count == 0 ? [] : _predicates.ToArray(); - return new FailurePredicateOptions - { - AsyncDownloadFailurePredicates = arr, - ProcessInParallel = _processInParallel, - ParallelThreads = _parallelThreads - }; - } - } - - // ---------- FragmentOptions Builder ---------- - public sealed class FragmentOptionsBuilder { - private int? _fragmentSize; - private bool _downloadInParallel = false; - private int? _parallelThreads = null; - - public FragmentOptionsBuilder WithFragmentSize(int bytes) { - if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); - _fragmentSize = bytes; - return this; - } - - public FragmentOptionsBuilder WithDownloadInParallel(bool value = true) { - _downloadInParallel = value; - return this; - } - - public FragmentOptionsBuilder WithParallelThreads(int? threads) { - if (threads.HasValue && threads.Value <= 0) - throw new System.ArgumentOutOfRangeException(nameof(threads)); - _parallelThreads = threads; - return this; - } - - public FragmentOptions Build() { - if (!_fragmentSize.HasValue) - throw new System.InvalidOperationException("FragmentSize must be provided."); - - return new FragmentOptions { - FragmentSize = _fragmentSize.Value, - DownloadInParallel = _downloadInParallel, - ParallelThreads = _parallelThreads - }; - } - } +// ---------- FragmentOptions Builder ---------- \ No newline at end of file diff --git a/Beam.Downloaders/UnitDownloaderOptionsBuilder.cs b/Beam.Downloaders/UnitDownloaderOptionsBuilder.cs new file mode 100644 index 0000000..4675092 --- /dev/null +++ b/Beam.Downloaders/UnitDownloaderOptionsBuilder.cs @@ -0,0 +1,107 @@ +using Beam.Models; + +namespace Beam.Downloaders; + +public sealed class UnitDownloaderOptionsBuilder { + private DownloadTarget _target = DownloadTarget.URL; + private HttpClient _client = new HttpClient(); + private FailurePredicateOptionsBuilder _failureOptionsBuilder = new(); + private FailurePredicateOptions? _failurePredicateOptionsOverride = null; + private SkipPredicateOptionsBuilder _skipPredicateOptionsBuilder = new(); + private SkipPredicateOptions? _skipPredicateOptionsOverride = null; + private FragmentOptions? _fragmentOptions; + private AsyncTransformer? _asyncTransformer; + private string? _downloadFolder = null; + private int _bufferSize = 80 * 1024; + + public UnitDownloaderOptionsBuilder WithTarget(DownloadTarget target) { + _target = target; + return this; + } + + public UnitDownloaderOptionsBuilder WithClient(HttpClient client) + { + _client = client ?? throw new System.ArgumentNullException(nameof(client)); + return this; + } + + public UnitDownloaderOptionsBuilder WithFailurePredicateOptions(FailurePredicateOptions? options) + { + _failurePredicateOptionsOverride = options; + return this; + } + + public UnitDownloaderOptionsBuilder WithFailurePredicates(System.Action> configure) + { + if (configure == null) throw new System.ArgumentNullException(nameof(configure)); + configure(_failureOptionsBuilder); + return this; + } + + public UnitDownloaderOptionsBuilder WithFragmentOptions(FragmentOptions? options) + { + _fragmentOptions = options; + return this; + } + + public UnitDownloaderOptionsBuilder WithSkipPredicates(Action> configure) { + if (configure == null) throw new ArgumentNullException(nameof(configure)); + configure(_skipPredicateOptionsBuilder); + return this; + } + + public UnitDownloaderOptionsBuilder WithSkipPredicateOptions( + SkipPredicateOptions skipPredicateOptions) { + _skipPredicateOptionsOverride = skipPredicateOptions; + return this; + } + + public UnitDownloaderOptionsBuilder WithFragments(System.Action configure) + { + if (configure == null) throw new System.ArgumentNullException(nameof(configure)); + var b = new FragmentOptionsBuilder(); + configure(b); + _fragmentOptions = b.Build(); + return this; + } + + public UnitDownloaderOptionsBuilder WithAsyncTransformer(AsyncTransformer transformer) + { + _asyncTransformer = transformer ?? throw new System.ArgumentNullException(nameof(transformer)); + return this; + } + + public UnitDownloaderOptionsBuilder WithDownloadFolder(string? downloadFolder) + { + _downloadFolder = downloadFolder; + return this; + } + + public UnitDownloaderOptionsBuilder WithBufferSize(int bytes) + { + if (bytes <= 0) throw new System.ArgumentOutOfRangeException(nameof(bytes)); + _bufferSize = bytes; + return this; + } + + public UnitDownloaderOptions Build() + { + if (_asyncTransformer == null) + throw new System.InvalidOperationException("AsyncTransformer must be provided."); + + _failurePredicateOptionsOverride ??= _failureOptionsBuilder.Build(); + _skipPredicateOptionsOverride ??= _skipPredicateOptionsBuilder.Build(); + + return new UnitDownloaderOptions + { + Target = _target, + Client = _client, + FailurePredicateOptions = _failurePredicateOptionsOverride, + SkipPredicateOptions = _skipPredicateOptionsOverride, + FragmentOptions = _fragmentOptions, + AsyncTransformer = _asyncTransformer, + DownloadFolder = _downloadFolder, + BufferSize = _bufferSize + }; + } +} \ No newline at end of file diff --git a/Beam.Downloaders/UnitFragmentDownloader.cs b/Beam.Downloaders/UnitFragmentDownloader.cs index a399128..017fce4 100644 --- a/Beam.Downloaders/UnitFragmentDownloader.cs +++ b/Beam.Downloaders/UnitFragmentDownloader.cs @@ -5,12 +5,12 @@ using HtmlAgilityPack; using Microsoft.Extensions.Logging; namespace Beam.Downloaders { - public class UnitFragmentDownloader(UnitDownloaderOptions options, - IUnitDownloader? internalDownloader = null) : IUnitDownloader>> where RawType : IDocument { + public class UnitFragmentDownloader(UnitDownloaderOptions options, + IUnitDownloader? internalDownloader = null) : IUnitDownloader>> { - public UnitDownloaderOptions Options { get; } = options; + public UnitDownloaderOptions Options { get; } = options; public int LinksPerDownload { get; set; } - private IUnitDownloader UnitDownloader { get; } = internalDownloader ?? new UnitDownloader(options); + private IUnitDownloader UnitDownloader { get; } = internalDownloader ?? new UnitDownloader(options); async Task<(bool, Fragment>?)> IUnitDownloader>>.TryDownload(IOrdered[] link, CancellationToken ct, int maximumRetryCount, IProgress? downProgress, IProgress? tryProgress) { Fragment> fragment = new Fragment>(link.Length); diff --git a/Beam.Fluent/ContextStage.cs b/Beam.Fluent/ContextStage.cs index c387c04..1d86a7f 100644 --- a/Beam.Fluent/ContextStage.cs +++ b/Beam.Fluent/ContextStage.cs @@ -8,14 +8,14 @@ using Beam.Downloaders; namespace Beam.Fluent; -internal sealed class ContextStage : IContextStage where RawType : IDocument { - private readonly DownloadContextBuilder _ctxBuilder; - private readonly AsyncTransformer _transformer; +internal sealed class ContextStage : IContextStage { + private readonly DownloadContextBuilder _ctxBuilder; + private readonly AsyncTransformer _transformer; private FragmentMode _fragmentMode = FragmentMode.Single; private Channel _channel = Channel.Plain; private readonly ContentKind _contentKind; private int _parallelism = 4; - private UnitDownloaderOptionsBuilder _optionsBuilder = new(); + private UnitDownloaderOptionsBuilder _optionsBuilder = new(); // ──────────────── playwright ──────────────── private PlaywrightAsyncManipulator? _playwrightManipulator = null; @@ -27,8 +27,8 @@ internal sealed class ContextStage : IContextStage ctxBuilder, - AsyncTransformer transformer) { + public ContextStage(DownloadContextBuilder ctxBuilder, + AsyncTransformer transformer) { _ctxBuilder = ctxBuilder; _transformer = transformer; _contentKind = transformer switch { @@ -43,28 +43,28 @@ internal sealed class ContextStage : IContextStage Configure(Action> configure) { + public IContextStage Configure(Action configure) { configure(_ctxBuilder); return this; } - public IContextStage ConfigureUnitDownloaderOptions( - Action> configure) { + public IContextStage ConfigureUnitDownloaderOptions( + Action> configure) { configure(_optionsBuilder); return this; } - public IContextStage WithParallelism(int degree) { + public IContextStage< OutType> WithParallelism(int degree) { _parallelism = Math.Max(1, degree); return this; } - public IContextStage WithTimeout(TimeSpan timeout) { + public IContextStage< OutType> WithTimeout(TimeSpan timeout) { _ctxBuilder.WithTimeOut(timeout); return this; } - public IContextStage WithRetryReporter(IProgress reporter) { + public IContextStage< OutType> WithRetryReporter(IProgress reporter) { _ctxBuilder.WithRetryReporter(reporter); return this; } @@ -73,7 +73,7 @@ internal sealed class ContextStage : IContextStage /// /// - public IContextStage UseFragments() { + public IContextStage< OutType> UseFragments() { if (_playwrightManipulator is not null) _playwrightManipulator = null; if (_channel == Channel.Playwright) @@ -88,7 +88,7 @@ internal sealed class ContextStage : IContextStage /// The page manipulator /// - public IContextStage UsePlaywright(PlaywrightAsyncManipulator manipulator) { + public IContextStage< OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator) { if (_fragmentMode == FragmentMode.Fragmented) _fragmentMode = FragmentMode.Single; if (_stealthManipulator is not null) @@ -99,7 +99,7 @@ internal sealed class ContextStage : IContextStage UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) { + public IContextStage< OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) { if (_playwrightManipulator is not null) _playwrightManipulator = null; @@ -109,7 +109,7 @@ internal sealed class ContextStage : IContextStage context) { + private object ConstructUnitDownloader(DownloadContext context) { #region Utility functions T To(object? o) where T : class @@ -145,27 +145,19 @@ internal sealed class ContextStage : IContextStage new UnitFragmentDownloader(options), + => new UnitFragmentDownloader< OutType>(options), // ──────────────── single ──────────────── (Channel.Plain, FragmentMode.Single, _) - => new UnitDownloader(options), + => new UnitDownloader< OutType>(options), // ──────────────── single playwright ──────────────── (Channel.Playwright, FragmentMode.Single, _) - => new PlaywrightUnitDownloader(options, EnsureExists(_playwrightManipulator)), - // ──────────────── single stealth file ──────────────── - (Channel.Stealth, FragmentMode.Single, ContentKind.File) - => new StealthUnitPageDownloader(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), - // ──────────────── single stealth binary ──────────────── + => new PlaywrightUnitDownloader< OutType>(options, EnsureExists(_playwrightManipulator)), + // ──────────────── single stealth ──────────────── (Channel.Stealth, FragmentMode.Single, ContentKind.Binary) - => new StealthUnitDownloader(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), - // ──────────────── fragment stealth file ──────────────── - (Channel.Stealth, FragmentMode.Fragmented, ContentKind.File) - => new StealthFragmentPageDownloader(options, - EnsureExists(_stealthConfig), - EnsureExists(_stealthManipulator)), - // ──────────────── fragment stealth binary ──────────────── + => new StealthUnitDownloader< OutType>(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), + // ──────────────── fragment stealth ──────────────── (Channel.Stealth, FragmentMode.Fragmented, ContentKind.Binary) - => new StealthFragmentDownloader(options, + => new StealthFragmentDownloader< OutType>(options, EnsureExists(_stealthConfig), EnsureExists(_stealthManipulator)), _ => throw new Exception(string.Format(Exceptions.Exceptions.fluent_unsupported_pattern, @@ -173,14 +165,14 @@ internal sealed class ContextStage : IContextStage> ConstructDownloader(DownloadContext context) { - var copyOfContext = DownloadContextBuilder.FromContext(context).Build(); + private IAsyncEnumerator> ConstructDownloader(DownloadContext context) { + var copyOfContext = DownloadContextBuilder.FromContext(context).Build(); return _fragmentMode switch { - FragmentMode.Fragmented => new SequentialFragmentDownloader( + FragmentMode.Fragmented => new SequentialFragmentDownloader( copyOfContext, ctx => (IUnitDownloader>>)ConstructUnitDownloader(ctx), context.DownloadLogger).UnwrapFragmented(), - FragmentMode.Single => new SequentialDownloader( + FragmentMode.Single => new SequentialDownloader< OutType>( copyOfContext, ctx => (IUnitDownloader)ConstructUnitDownloader(ctx), context.DownloadLogger).WrapOrdered(), diff --git a/Beam.Fluent/Core/IContextStage.cs b/Beam.Fluent/Core/IContextStage.cs index 5ad4486..2be7bf0 100644 --- a/Beam.Fluent/Core/IContextStage.cs +++ b/Beam.Fluent/Core/IContextStage.cs @@ -6,15 +6,15 @@ using Beam.Stealth; namespace Beam.Fluent; -public interface IContextStage { - IContextStage Configure(Action> configure); - IContextStage WithParallelism(int degree); - IContextStage WithTimeout(TimeSpan timeout); - IContextStage WithRetryReporter(IProgress reporter); - IContextStage UseFragments(); - IContextStage UsePlaywright(PlaywrightAsyncManipulator manipulator); - IContextStage UseStealth(StealthAsyncManipulator manipulator, StealthConfig config); - IContextStage ConfigureUnitDownloaderOptions( - Action> configure); +public interface IContextStage { + IContextStage Configure(Action configure); + IContextStage WithParallelism(int degree); + IContextStage WithTimeout(TimeSpan timeout); + IContextStage WithRetryReporter(IProgress reporter); + IContextStage UseFragments(); + IContextStage UsePlaywright(PlaywrightAsyncManipulator manipulator); + IContextStage UseStealth(StealthAsyncManipulator manipulator, StealthConfig config); + IContextStage ConfigureUnitDownloaderOptions( + Action> configure); DownloadEnumerable Build(); } \ No newline at end of file diff --git a/Beam.Fluent/Core/IDownloadStage.cs b/Beam.Fluent/Core/IDownloadStage.cs index 4c2c6bc..bfd3543 100644 --- a/Beam.Fluent/Core/IDownloadStage.cs +++ b/Beam.Fluent/Core/IDownloadStage.cs @@ -2,10 +2,10 @@ namespace Beam.Fluent; -public interface IDownloadStage { - IDownloadStage SaveToDirectory(string dir); - IDownloadStage SaveToFiles(IEnumerable files); - IDownloadStage SaveToMemory(ConcurrentBag bag); +public interface IDownloadStage { + IDownloadStage SaveToDirectory(string dir); + IDownloadStage SaveToFiles(IEnumerable files); + IDownloadStage SaveToMemory(ConcurrentBag bag); void WaitForDownload(); Task WaitForDownloadAsync(); DownloadEnumerable AsAsyncEnumerable(); diff --git a/Beam.Fluent/Core/ITransformStage.cs b/Beam.Fluent/Core/ITransformStage.cs index 34e2ae5..2fd6637 100644 --- a/Beam.Fluent/Core/ITransformStage.cs +++ b/Beam.Fluent/Core/ITransformStage.cs @@ -3,6 +3,6 @@ using Beam.Models; namespace Beam.Fluent; -public interface ITransformStage { - IContextStage WithTransformer(AsyncTransformer factory); +public interface ITransformStage { + IContextStage WithTransformer(AsyncTransformer factory); } \ No newline at end of file diff --git a/Beam.Fluent/DownloadStage.cs b/Beam.Fluent/DownloadStage.cs index 4b3a896..43a1b3c 100644 --- a/Beam.Fluent/DownloadStage.cs +++ b/Beam.Fluent/DownloadStage.cs @@ -5,7 +5,7 @@ using Beam.Models; namespace Beam.Fluent; -internal sealed class DownloadStage(DownloadEnumerable download) : IDownloadStage where RawType : IDocument { +internal sealed class DownloadStage(DownloadEnumerable download) : IDownloadStage { private IAsyncEnumerable> _download = download; public DownloadEnumerable AsAsyncEnumerable() { @@ -22,16 +22,16 @@ internal sealed class DownloadStage(DownloadEnumerable SaveToDirectory(string dir) { + public IDownloadStage SaveToDirectory(string dir) { _download = _SaveToDirectory(dir); return this; } - public IDownloadStage SaveToFiles(IEnumerable files) { + public IDownloadStage SaveToFiles(IEnumerable files) { throw new NotImplementedException(); } - public IDownloadStage SaveToMemory(ConcurrentBag bag) { + public IDownloadStage SaveToMemory(ConcurrentBag bag) { throw new NotImplementedException(); } diff --git a/Beam.Fluent/FluentDownload.cs b/Beam.Fluent/FluentDownload.cs index bce657a..b092d4d 100644 --- a/Beam.Fluent/FluentDownload.cs +++ b/Beam.Fluent/FluentDownload.cs @@ -8,22 +8,22 @@ using Beam.Models; namespace Beam.Fluent; public static class FluentDownload { - public static ITransformStage Links(params IEnumerable links) where RawType : IDocument { - return new TransformStage(new DownloadContextBuilder() + public static ITransformStage Links(params IEnumerable links) { + return new TransformStage(new DownloadContextBuilder() .WithLinks(links)); } - public static ITransformStage - ResourceDefinition(ResourceDefinition definition) where RawType : IDocument { + public static ITransformStage< OutType> + ResourceDefinition< OutType>(ResourceDefinition definition) { if (definition.Location.States.Count == 0) throw new ArgumentException(Exceptions.Exceptions.resource_definition_invalid_states_count, nameof(definition)); var linkGenerator = new OrderedLinkGenerator(definition.Location.Segments, (NumberedStateChanger)definition.Location.StateChanger.Behavior, definition.Location.States.First().Copy()); - return new TransformStage(new DownloadContextBuilder() + return new TransformStage< OutType>(new DownloadContextBuilder() .WithLinks(StringEnumerable.FromGenerator(linkGenerator!))); } - public static ITransformStage FromContext(DownloadContext existing) where RawType : IDocument { - return new TransformStage(DownloadContextBuilder.FromContext(existing)); + public static ITransformStage< OutType> FromContext< OutType>(DownloadContext existing) { + return new TransformStage< OutType>(DownloadContextBuilder.FromContext(existing)); } } \ No newline at end of file diff --git a/Beam.Fluent/TransformStage.cs b/Beam.Fluent/TransformStage.cs index 953ee14..1e35b3b 100644 --- a/Beam.Fluent/TransformStage.cs +++ b/Beam.Fluent/TransformStage.cs @@ -6,8 +6,8 @@ using Beam.Models; namespace Beam.Fluent; -internal sealed class TransformStage(DownloadContextBuilder CtxBuilder) : ITransformStage where RawType : IDocument { - public IContextStage WithTransformer(AsyncTransformer transformer) { - return new ContextStage(CtxBuilder, transformer); +internal sealed class TransformStage(DownloadContextBuilder CtxBuilder) : ITransformStage { + public IContextStage WithTransformer(AsyncTransformer transformer) { + return new ContextStage(CtxBuilder, transformer); } } \ No newline at end of file diff --git a/Beam.Models/DownloadTarget.cs b/Beam.Models/DownloadTarget.cs new file mode 100644 index 0000000..adaf393 --- /dev/null +++ b/Beam.Models/DownloadTarget.cs @@ -0,0 +1,28 @@ +namespace Beam.Models; + +public enum DownloadTarget { + /// + /// Specifies the target as the object directly returned through accessing the URL (whole page). + /// + /// + /// Default to this mode where possible. + /// + URL, + /// + /// Specifies the target as an object accessible only through the url (element in page). + /// + /// + /// Only use this mode if what is needed + /// cannot be acquired by using + /// + InURL, + /// + /// Specifies the target as an object that may be retrieved through a user-defined operation on the url + /// (e.g. javascript triggered downloads). + /// + /// + /// Only use this mode if what is needed cannot be acquired by either + /// or + /// + Complex +} \ No newline at end of file diff --git a/Beam.Models/SkipPredicate.cs b/Beam.Models/SkipPredicate.cs new file mode 100644 index 0000000..d1a4144 --- /dev/null +++ b/Beam.Models/SkipPredicate.cs @@ -0,0 +1,5 @@ +using System.Diagnostics.CodeAnalysis; + +namespace Beam.Models; + +public delegate bool SkipPredicate(string link, [NotNullWhen(true)] out T defaultValue); diff --git a/Beam.Playwright/PlaywrightUnitDownloader.cs b/Beam.Playwright/PlaywrightUnitDownloader.cs index 8e203e8..e2cdba2 100644 --- a/Beam.Playwright/PlaywrightUnitDownloader.cs +++ b/Beam.Playwright/PlaywrightUnitDownloader.cs @@ -1,36 +1,31 @@ using Beam.Abstractions; using Beam.Downloaders; using Beam.Models; +using Beam.Playwright.Strategies; using Microsoft.Playwright; namespace Beam.Playwright { - public class PlaywrightUnitDownloader( - UnitDownloaderOptions options, - PlaywrightAsyncManipulator puppetManipulator) - : UnitDownloader(options) - where RawType : IDocument { - public PlaywrightAsyncManipulator PuppetManipulator { get; } = puppetManipulator; + public class PlaywrightUnitDownloader : UnitDownloader { + public PlaywrightUnitDownloader(UnitDownloaderOptions options, + PlaywrightAsyncManipulator puppetManipulator) : base(options) { + PuppetManipulator = puppetManipulator; + _downloadStrategy = options.Target switch { + DownloadTarget.URL or DownloadTarget.InURL => new PageDownloadStrategy(), + DownloadTarget.Complex => new WaitingDownloadStrategy(), + _ => throw new NotSupportedException() // TODO add an exception message + }; + } + + public PlaywrightAsyncManipulator PuppetManipulator { get; } + private IDownloadStrategy _downloadStrategy { get; } + protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, CancellationToken ct) { var page = await PlaywrightContext.Browser.Value.NewPageAsync(); try { await page.GotoAsync(url); await PuppetManipulator(page); - var download = await page.WaitForDownloadAsync(); - - await using var stream = await download.CreateReadStreamAsync(); - var buffer = new byte[bufferSize]; - var inBuffer = 0; - var downloaded = 0; - while ((inBuffer = stream.Read(buffer)) > 0) { - downloaded += inBuffer; - progress?.Report(new DownloadReport() { - BytesDownloaded = downloaded, - BytesRemaining = stream.Length - downloaded - }); - await destinationStream.WriteAsync(buffer.AsMemory(0, inBuffer), ct); - } - + await _downloadStrategy.DownloadToStream(url, bufferSize, destinationStream, progress, page, ct); } finally { if (!page.IsClosed) await page.CloseAsync(); diff --git a/Beam.Playwright/Strategies/IDownloadStrategy.cs b/Beam.Playwright/Strategies/IDownloadStrategy.cs new file mode 100644 index 0000000..e3c645d --- /dev/null +++ b/Beam.Playwright/Strategies/IDownloadStrategy.cs @@ -0,0 +1,9 @@ +using Beam.Abstractions; +using Microsoft.Playwright; + +namespace Beam.Playwright.Strategies; + +internal interface IDownloadStrategy { + Task DownloadToStream(string url, int bufferSize, Stream destinationStream, + IProgress progress, IPage page, CancellationToken ct); +} \ No newline at end of file diff --git a/Beam.Playwright/Strategies/PageDownloadStrategy.cs b/Beam.Playwright/Strategies/PageDownloadStrategy.cs new file mode 100644 index 0000000..d975407 --- /dev/null +++ b/Beam.Playwright/Strategies/PageDownloadStrategy.cs @@ -0,0 +1,15 @@ +using System.Text; +using Beam.Abstractions; +using Microsoft.Playwright; + +namespace Beam.Playwright.Strategies; + +internal class PageDownloadStrategy : IDownloadStrategy { + public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, IPage page, + CancellationToken ct) { + + var source = await page.InnerHTMLAsync("html", new PageInnerHTMLOptions() { Strict = false }); + var bytes = Encoding.UTF8.GetBytes(source); + await destinationStream.WriteAsync(bytes, ct); + } +} \ No newline at end of file diff --git a/Beam.Playwright/Strategies/WaitingDownloadStrategy.cs b/Beam.Playwright/Strategies/WaitingDownloadStrategy.cs new file mode 100644 index 0000000..55c901b --- /dev/null +++ b/Beam.Playwright/Strategies/WaitingDownloadStrategy.cs @@ -0,0 +1,25 @@ +using Beam.Abstractions; +using Beam.Models; +using Microsoft.Playwright; + +namespace Beam.Playwright.Strategies; + +internal class WaitingDownloadStrategy : IDownloadStrategy { + public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, IPage page, + CancellationToken ct) { + + var download = await page.WaitForDownloadAsync(); + await using var stream = await download.CreateReadStreamAsync(); + var buffer = new byte[bufferSize]; + var inBuffer = 0; + var downloaded = 0; + while ((inBuffer = stream.Read(buffer)) > 0) { + downloaded += inBuffer; + progress?.Report(new DownloadReport() { + BytesDownloaded = downloaded, + BytesRemaining = stream.Length - downloaded + }); + await destinationStream.WriteAsync(buffer.AsMemory(0, inBuffer), ct); + } + } +} \ No newline at end of file diff --git a/Beam.Stealth/StealthFragmentDownloader.cs b/Beam.Stealth/StealthFragmentDownloader.cs index d471165..cd2b29b 100644 --- a/Beam.Stealth/StealthFragmentDownloader.cs +++ b/Beam.Stealth/StealthFragmentDownloader.cs @@ -9,7 +9,7 @@ using Beam.Downloaders; using Beam.Models; namespace Beam.Stealth { - public class StealthFragmentDownloader : UnitFragmentDownloader where RawType : IDocument { - public StealthFragmentDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitDownloader(options, config, manipulator)) {} + public class StealthFragmentDownloader : UnitFragmentDownloader { + public StealthFragmentDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitDownloader(options, config, manipulator)) {} } } diff --git a/Beam.Stealth/StealthFragmentPageDownloader.cs b/Beam.Stealth/StealthFragmentPageDownloader.cs deleted file mode 100644 index 9d931f5..0000000 --- a/Beam.Stealth/StealthFragmentPageDownloader.cs +++ /dev/null @@ -1,16 +0,0 @@ -using HtmlAgilityPack; -using Microsoft.Extensions.Logging; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Beam.Abstractions; -using Beam.Downloaders; -using Beam.Models; - -namespace Beam.Stealth { - public class StealthFragmentPageDownloader : UnitFragmentDownloader where RawType : IDocument { - public StealthFragmentPageDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitPageDownloader(options, config, manipulator)) {} - } -} diff --git a/Beam.Stealth/StealthUnitDownloader.cs b/Beam.Stealth/StealthUnitDownloader.cs index 5a5ac27..b489c41 100644 --- a/Beam.Stealth/StealthUnitDownloader.cs +++ b/Beam.Stealth/StealthUnitDownloader.cs @@ -9,18 +9,27 @@ using System.Threading.Tasks; using Beam.Abstractions; using Beam.Downloaders; using Beam.Models; +using Beam.Stealth.Strategies; namespace Beam.Stealth { using File = System.IO.File; - public class StealthUnitDownloader : UnitDownloader where RawType : IDocument { + public class StealthUnitDownloader : UnitDownloader { public StealthConfig Config { get; } public StealthAsyncManipulator Manipulator { get; } private ILogger? Logger => Config.Logger; - public StealthUnitDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) { + private IDownloadStrategy _downloadStrategy { get; } + + public StealthUnitDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) { Config = config; Manipulator = manipulator; + + _downloadStrategy = options.Target switch { + DownloadTarget.URL or DownloadTarget.InURL => new PageDownloadStrategy(), + DownloadTarget.Complex => new WaitingDownloadStrategy(), + _ => throw new NotSupportedException() // TODO add an exception message + }; } protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, @@ -29,76 +38,7 @@ namespace Beam.Stealth { await driver.Navigate().GoToUrlAsync(url); await Manipulator(driver); - await using var stream = await WaitForDownloadAsync(url, progress, Stopwatch.StartNew(), ct); - await (stream?.CopyToAsync(destinationStream, ct) ?? Task.CompletedTask); - } - - /* --------------------------------------------------------------------- */ - - private async Task WaitForDownloadAsync( - string link, IProgress progress, Stopwatch sw, CancellationToken ct) { - const int PollDelayMs = 250; // how often we look - const int StableDelayMs = 1000; // size-unchanged window - - string dir = Config.DownloadsDirectory; - string? finalPath = null; - long lastSize = -1; - DateTime lastChange = DateTime.UtcNow; - - bool IsTemp(string p) => - p.EndsWith(".crdownload", StringComparison.OrdinalIgnoreCase) || - p.EndsWith(".part", StringComparison.OrdinalIgnoreCase); - - Logger?.LogDebug("Polling {Dir} for download files", dir); - - while (sw.Elapsed < Config.TimeOut && !ct.IsCancellationRequested) { - // current files in the directory - var files = Directory.EnumerateFiles(dir, "*", SearchOption.TopDirectoryOnly).ToArray(); - - // ignore temp names; pick (or re-pick) the first real candidate - finalPath ??= files.FirstOrDefault(f => !IsTemp(f)); - - // still nothing but temps – keep waiting - if (finalPath is null) { - await Task.Delay(PollDelayMs, ct); - continue; - } - - // track growth - long size = new FileInfo(finalPath).Length; - if (size == 0 || size != lastSize) { - progress?.Report(new DownloadReport() { - BytesDownloaded = size - lastSize, - }); - lastSize = size; - lastChange = DateTime.UtcNow; - await Task.Delay(PollDelayMs, ct); - continue; - } - - // size stable long enough *and* no temp files left? - bool tempsRemain = files.Any(IsTemp); - if ((DateTime.UtcNow - lastChange).TotalMilliseconds < StableDelayMs || tempsRemain) { - await Task.Delay(PollDelayMs, ct); - continue; - } - - // wait until writer releases lock - while (true) { - try { - using FileStream _ = - File.Open(finalPath, FileMode.Open, FileAccess.Read, FileShare.None); - break; - } catch (IOException) { - await Task.Delay(200, ct); - } - } - - return File.OpenRead(finalPath); - } - - Logger?.LogWarning("Download timed out after {Elapsed}", sw.Elapsed); - return null; + await _downloadStrategy.DownloadToStream(url, bufferSize, destinationStream, progress, Config, Logger, ct); } diff --git a/Beam.Stealth/StealthUnitPageDownloader.cs b/Beam.Stealth/StealthUnitPageDownloader.cs deleted file mode 100644 index 48d5e01..0000000 --- a/Beam.Stealth/StealthUnitPageDownloader.cs +++ /dev/null @@ -1,33 +0,0 @@ -using HtmlAgilityPack; -using Microsoft.Extensions.Logging; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Beam.Abstractions; -using Beam.Downloaders; -using Beam.Models; - -namespace Beam.Stealth { - public class StealthUnitPageDownloader : UnitDownloader where RawType : IDocument { - public StealthConfig Config { get; } - public StealthAsyncManipulator Manipulator { get; } - private ILogger? Logger => Config.Logger; - - public StealthUnitPageDownloader(UnitDownloaderOptions options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) { - Config = config; - Manipulator = manipulator; - } - - protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, CancellationToken ct) { - var driver = Config.Driver; - - await driver.Navigate().GoToUrlAsync(url); - await Manipulator(driver); - - byte[] bytes = Encoding.UTF8.GetBytes(driver.PageSource); - await destinationStream.WriteAsync(bytes, ct); - } - } -} diff --git a/Beam.Stealth/Strategies/IDownloadStrategy.cs b/Beam.Stealth/Strategies/IDownloadStrategy.cs new file mode 100644 index 0000000..e0b6ff7 --- /dev/null +++ b/Beam.Stealth/Strategies/IDownloadStrategy.cs @@ -0,0 +1,9 @@ +using Beam.Abstractions; +using Microsoft.Extensions.Logging; + +namespace Beam.Stealth.Strategies; + +internal interface IDownloadStrategy { + Task DownloadToStream(string url, int bufferSize, Stream destinationStream, + IProgress progress, StealthConfig config, ILogger? logger, CancellationToken ct); +} \ No newline at end of file diff --git a/Beam.Stealth/Strategies/PageDownloadStrategy.cs b/Beam.Stealth/Strategies/PageDownloadStrategy.cs new file mode 100644 index 0000000..b047d0b --- /dev/null +++ b/Beam.Stealth/Strategies/PageDownloadStrategy.cs @@ -0,0 +1,13 @@ +using System.Text; +using Beam.Abstractions; +using Microsoft.Extensions.Logging; + +namespace Beam.Stealth.Strategies; + +internal class PageDownloadStrategy : IDownloadStrategy { + public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, StealthConfig config, + ILogger? logger, CancellationToken ct) { + byte[] bytes = Encoding.UTF8.GetBytes(config.Driver.PageSource); + await destinationStream.WriteAsync(bytes, ct); + } +} \ No newline at end of file diff --git a/Beam.Stealth/Strategies/WaitingDownloadStrategy.cs b/Beam.Stealth/Strategies/WaitingDownloadStrategy.cs new file mode 100644 index 0000000..4c676ce --- /dev/null +++ b/Beam.Stealth/Strategies/WaitingDownloadStrategy.cs @@ -0,0 +1,83 @@ +using System.Diagnostics; +using Beam.Abstractions; +using Beam.Models; +using Microsoft.Extensions.Logging; +using File = System.IO.File; + +namespace Beam.Stealth.Strategies; + +public class WaitingDownloadStrategy : IDownloadStrategy { + public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress progress, StealthConfig config, + ILogger? logger, CancellationToken ct) { + await using var stream = await WaitForDownloadAsync(url, progress, Stopwatch.StartNew(), config, logger, ct); + await (stream?.CopyToAsync(destinationStream, ct) ?? Task.CompletedTask); + } + + + private async Task WaitForDownloadAsync( + string link, IProgress progress, Stopwatch sw, StealthConfig config, ILogger? logger, CancellationToken ct) { + const int PollDelayMs = 250; // how often we look + const int StableDelayMs = 1000; // size-unchanged window + + string dir = config.DownloadsDirectory; + string? finalPath = null; + long lastSize = -1; + DateTime lastChange = DateTime.UtcNow; + + bool IsTemp(string p) => + p.EndsWith(".crdownload", StringComparison.OrdinalIgnoreCase) || + p.EndsWith(".part", StringComparison.OrdinalIgnoreCase); + + logger?.LogDebug("Polling {Dir} for download files", dir); + + while (sw.Elapsed < config.TimeOut && !ct.IsCancellationRequested) { + // current files in the directory + var files = Directory.EnumerateFiles(dir, "*", SearchOption.TopDirectoryOnly).ToArray(); + + // ignore temp names; pick (or re-pick) the first real candidate + finalPath ??= files.FirstOrDefault(f => !IsTemp(f)); + + // still nothing but temps – keep waiting + if (finalPath is null) { + await Task.Delay(PollDelayMs, ct); + continue; + } + + // track growth + long size = new FileInfo(finalPath).Length; + if (size == 0 || size != lastSize) { + progress?.Report(new DownloadReport() { + BytesDownloaded = size - lastSize, + }); + lastSize = size; + lastChange = DateTime.UtcNow; + await Task.Delay(PollDelayMs, ct); + continue; + } + + // size stable long enough *and* no temp files left? + bool tempsRemain = files.Any(IsTemp); + if ((DateTime.UtcNow - lastChange).TotalMilliseconds < StableDelayMs || tempsRemain) { + await Task.Delay(PollDelayMs, ct); + continue; + } + + // wait until writer releases lock + while (true) { + try { + using FileStream _ = + File.Open(finalPath, FileMode.Open, FileAccess.Read, FileShare.None); + break; + } catch (IOException) { + await Task.Delay(200, ct); + } + } + + return File.OpenRead(finalPath); + } + + logger?.LogWarning("Download timed out after {Elapsed}", sw.Elapsed); + return null; + } + +} \ No newline at end of file