2958a26e4f
Replaces specialized binary and HTML downloaders with a generic, options-driven UnitDownloader and UnitFragmentDownloader pattern. Introduces UnitDownloaderOptions and builder classes for flexible configuration, updates interfaces and method signatures to support progress reporting, and removes redundant binary-specific classes. Updates Playwright and Stealth downloaders to use the new generic base, and adds improved error handling and reporting. Also updates dependency versions and makes minor API consistency improvements across the Fluent and Models layers.
197 lines
9.4 KiB
C#
197 lines
9.4 KiB
C#
using Beam.Models;
|
|
using HtmlAgilityPack;
|
|
using Beam.Playwright;
|
|
using Beam.Stealth;
|
|
using Beam;
|
|
using Beam.Abstractions;
|
|
using Beam.Downloaders;
|
|
|
|
namespace Beam.Fluent;
|
|
|
|
/// <summary>
/// Fluent stage that gathers the download settings (context, channel, fragment mode,
/// unit-downloader options) and finally assembles a <see cref="DownloadEnumerable{T}"/>.
/// </summary>
/// <typeparam name="RawType">Raw document type handed to the transformer.</typeparam>
/// <typeparam name="OutType">Type produced by the transformer.</typeparam>
internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, OutType> where RawType : IDocument {
    private readonly DownloadContextBuilder<RawType> _ctxBuilder;
    private readonly AsyncTransformer<RawType, OutType> _transformer;
    private readonly ContentKind _contentKind;
    private FragmentMode _fragmentMode = FragmentMode.Single;
    private Channel _channel = Channel.Plain;

    // NOTE(review): this value is stored by WithParallelism but is not read anywhere in this
    // class; ConstructDownloader only creates the Sequential* downloaders. Confirm whether it
    // is still needed.
    private int _parallelism = 4;

    private UnitDownloaderOptionsBuilder<RawType, OutType> _optionsBuilder = new();

    // ──────────────── playwright ────────────────
    private PlaywrightAsyncManipulator? _playwrightManipulator = null;
    // ────────────────────────────────────────────

    // ──────────────── stealth ───────────────────
    private StealthAsyncManipulator? _stealthManipulator = null;
    private StealthConfig? _stealthConfig = null;
    // ────────────────────────────────────────────

    /// <summary>
    /// Creates the stage and derives the content kind from the transformer's input document type.
    /// </summary>
    /// <param name="ctxBuilder">Builder used to produce the download context.</param>
    /// <param name="transformer">Transformer that turns a raw document into <typeparamref name="OutType"/>.</param>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">
    /// The transformer consumes neither <c>StringDocument</c> nor <c>ByteDocument</c>.
    /// </exception>
    public ContextStage(DownloadContextBuilder<RawType> ctxBuilder,
                        AsyncTransformer<RawType, OutType> transformer) {
        // Reject null up front: otherwise a null transformer would fall into the default arm
        // below and fail with a NullReferenceException while building the error message.
        _ctxBuilder = ctxBuilder ?? throw new ArgumentNullException(nameof(ctxBuilder));
        _transformer = transformer ?? throw new ArgumentNullException(nameof(transformer));

        _contentKind = transformer switch {
            AsyncTransformer<StringDocument, OutType> => ContentKind.File,
            AsyncTransformer<ByteDocument, OutType> => ContentKind.Binary,
            _ => throw new ArgumentException(string.Format(Exceptions.Exceptions.fluent_unsupported_transformer,
                                                           transformer.GetType()
                                                                      .AsUniqueName()))
        };

        _optionsBuilder.WithAsyncTransformer(_transformer);
    }

    /// <summary>Lets the caller adjust the underlying download-context builder.</summary>
    /// <param name="configure">Callback invoked with the context builder.</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="configure"/> is <c>null</c>.</exception>
    public IContextStage<RawType, OutType> Configure(Action<DownloadContextBuilder<RawType>> configure) {
        if (configure is null) {
            throw new ArgumentNullException(nameof(configure));
        }

        configure(_ctxBuilder);
        return this;
    }

    /// <summary>Lets the caller adjust the unit-downloader options builder.</summary>
    /// <param name="configure">Callback invoked with the options builder.</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="configure"/> is <c>null</c>.</exception>
    public IContextStage<RawType, OutType> ConfigureUnitDownloaderOptions(
        Action<UnitDownloaderOptionsBuilder<RawType, OutType>> configure) {
        if (configure is null) {
            throw new ArgumentNullException(nameof(configure));
        }

        configure(_optionsBuilder);
        return this;
    }

    /// <summary>Stores the requested degree of parallelism; values below 1 are raised to 1.</summary>
    /// <param name="degree">Requested degree of parallelism.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<RawType, OutType> WithParallelism(int degree) {
        _parallelism = Math.Max(1, degree);
        return this;
    }

    /// <summary>Forwards the timeout to the download-context builder.</summary>
    /// <param name="timeout">Timeout passed to the context builder.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<RawType, OutType> WithTimeout(TimeSpan timeout) {
        _ctxBuilder.WithTimeOut(timeout);
        return this;
    }

    /// <summary>Forwards the retry reporter to the download-context builder.</summary>
    /// <param name="reporter">Progress sink for retry reports.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<RawType, OutType> WithRetryReporter(IProgress<IRetryReport> reporter) {
        _ctxBuilder.WithRetryReporter(reporter);
        return this;
    }

    /// <summary>
    /// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive with <see cref="UsePlaywright(PlaywrightAsyncManipulator)"/>
    /// </summary>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<RawType, OutType> UseFragments() {
        // Playwright has no fragmented variant in the downloader selection, so drop it.
        // A stealth channel is left as is: it supports fragmented downloads.
        _playwrightManipulator = null;
        if (_channel == Channel.Playwright) {
            _channel = Channel.Plain;
        }

        _fragmentMode = FragmentMode.Fragmented;
        return this;
    }

    /// <summary>
    /// Use a puppet browser to download the links. This strategy is mutually exclusive with <see cref="UseFragments"/>
    /// </summary>
    /// <param name="manipulator">The page manipulator</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="manipulator"/> is <c>null</c>.</exception>
    public IContextStage<RawType, OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator) {
        if (manipulator is null) {
            throw new ArgumentNullException(nameof(manipulator));
        }

        // Switching to Playwright discards the competing strategies: fragment mode and
        // every piece of stealth state (manipulator and its configuration).
        _fragmentMode = FragmentMode.Single;
        _stealthManipulator = null;
        _stealthConfig = null;

        _channel = Channel.Playwright;
        _playwrightManipulator = manipulator;
        return this;
    }

    /// <summary>
    /// Routes downloads through the stealth channel. Any Playwright manipulator is discarded;
    /// the fragment mode is left untouched, so this can be combined with <see cref="UseFragments"/>.
    /// </summary>
    /// <param name="manipulator">The stealth page manipulator.</param>
    /// <param name="config">The stealth configuration.</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public IContextStage<RawType, OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) {
        if (manipulator is null) {
            throw new ArgumentNullException(nameof(manipulator));
        }
        if (config is null) {
            throw new ArgumentNullException(nameof(config));
        }

        _playwrightManipulator = null;

        _channel = Channel.Stealth;
        _stealthManipulator = manipulator;
        _stealthConfig = config;
        return this;
    }

    /// <summary>
    /// Creates the unit downloader matching the configured channel, fragment mode and content kind.
    /// </summary>
    /// <param name="context">Context supplying the client and the optional failure predicates.</param>
    /// <returns>
    /// The downloader as <see cref="object"/>; the caller casts it to the unit-downloader
    /// interface that fits the fragment mode.
    /// </returns>
    /// <exception cref="InvalidOperationException">A channel is selected but its manipulator or configuration is missing.</exception>
    /// <exception cref="NotSupportedException">The channel / fragment mode / content kind combination has no downloader.</exception>
    private object ConstructUnitDownloader(DownloadContext<RawType> context) {
        // Fails loudly when a channel was selected without the state it needs.
        static T EnsureExists<T>(T? value) where T : class
            => value ?? throw new InvalidOperationException(Exceptions.Exceptions.fluent_invalid_state);

        // NOTE(review): the options builder is shared by all calls of this method, so each call
        // re-applies the predicates and the client to the same builder instance. Whether
        // WithPredicates replaces or appends is not visible from here - confirm.
        if (context.AsyncFailurePredicates is not null) {
            _optionsBuilder.WithFailurePredicates(
                predicates => predicates.WithPredicates(context.AsyncFailurePredicates));
        }

        var options = _optionsBuilder
            .WithClient(context.Client)
            .Build();

        // The switch is returned directly so that it is target-typed to object.
        return (_channel, _fragmentMode, _contentKind) switch {
            // ──────────────── fragmented ────────────────
            (Channel.Plain, FragmentMode.Fragmented, _)
                => new UnitFragmentDownloader<RawType, OutType>(options),
            // ──────────────── single ────────────────
            (Channel.Plain, FragmentMode.Single, _)
                => new UnitDownloader<RawType, OutType>(options),
            // ──────────────── single playwright ────────────────
            (Channel.Playwright, FragmentMode.Single, _)
                => new PlaywrightUnitDownloader<RawType, OutType>(options,
                                                                  EnsureExists(_playwrightManipulator)),
            // ──────────────── single stealth file ────────────────
            (Channel.Stealth, FragmentMode.Single, ContentKind.File)
                => new StealthUnitPageDownloader<RawType, OutType>(options,
                                                                   EnsureExists(_stealthConfig),
                                                                   EnsureExists(_stealthManipulator)),
            // ──────────────── single stealth binary ────────────────
            (Channel.Stealth, FragmentMode.Single, ContentKind.Binary)
                => new StealthUnitDownloader<RawType, OutType>(options,
                                                               EnsureExists(_stealthConfig),
                                                               EnsureExists(_stealthManipulator)),
            // ──────────────── fragment stealth file ────────────────
            (Channel.Stealth, FragmentMode.Fragmented, ContentKind.File)
                => new StealthFragmentPageDownloader<RawType, OutType>(options,
                                                                       EnsureExists(_stealthConfig),
                                                                       EnsureExists(_stealthManipulator)),
            // ──────────────── fragment stealth binary ────────────────
            (Channel.Stealth, FragmentMode.Fragmented, ContentKind.Binary)
                => new StealthFragmentDownloader<RawType, OutType>(options,
                                                                   EnsureExists(_stealthConfig),
                                                                   EnsureExists(_stealthManipulator)),
            _ => throw new NotSupportedException(string.Format(Exceptions.Exceptions.fluent_unsupported_pattern,
                                                               $"({_channel}, {_fragmentMode}, {_contentKind})")),
        };
    }

    /// <summary>
    /// Wraps the unit-downloader factory in the sequential downloader that fits the fragment mode
    /// and exposes it as an ordered asynchronous enumerator.
    /// </summary>
    /// <param name="context">The built download context; a copy of it is handed to the downloader.</param>
    /// <returns>Enumerator over the ordered results.</returns>
    /// <exception cref="NotSupportedException">The fragment mode is not handled.</exception>
    private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
        // The downloader receives its own copy of the context; the logger is taken from the original.
        var copyOfContext = DownloadContextBuilder<RawType>.FromContext(context).Build();

        return _fragmentMode switch {
            FragmentMode.Fragmented => new SequentialFragmentDownloader<RawType, OutType>(
                copyOfContext,
                ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
                context.DownloadLogger).UnwrapFragmented(),
            FragmentMode.Single => new SequentialDownloader<RawType, OutType>(
                copyOfContext,
                ctx => (IUnitDownloader<OutType>)ConstructUnitDownloader(ctx),
                context.DownloadLogger).WrapOrdered(),
            _ => throw new NotSupportedException(string.Format(Exceptions.Exceptions.fluent_unsupported_pattern,
                                                               $"{_fragmentMode}")),
        };
    }

    /// <summary>Builds the context and returns the enumerable that drives the configured downloader.</summary>
    /// <returns>The download enumerable for the current configuration.</returns>
    public DownloadEnumerable<OutType> Build() {
        var context = _ctxBuilder.Build();
        return new DownloadEnumerable<OutType>(ConstructDownloader(context));
    }
}