f52aa6123b
Replaces generic RawType with ByteDocument in downloaders and context classes, simplifying type usage. Adds builder classes for FailurePredicateOptions, FragmentOptions, SkipPredicateOptions, and UnitDownloaderOptions to improve configuration flexibility. Introduces DownloadTarget enum and SkipPredicate delegate for more granular download control. Refactors Fluent API interfaces and implementations to remove RawType generics and streamline usage. Adds Playwright and Stealth download strategies for extensibility.
189 lines
8.4 KiB
C#
189 lines
8.4 KiB
C#
using Beam.Models;
|
|
using HtmlAgilityPack;
|
|
using Beam.Playwright;
|
|
using Beam.Stealth;
|
|
using Beam;
|
|
using Beam.Abstractions;
|
|
using Beam.Downloaders;
|
|
|
|
namespace Beam.Fluent;
|
|
|
|
/// <summary>
/// Fluent stage that gathers download settings (channel, fragment mode and unit-downloader
/// options) and turns them into a <see cref="DownloadEnumerable{T}"/> once <see cref="Build"/> is called.
/// </summary>
/// <typeparam name="OutType">Type the transformer produces for each downloaded document.</typeparam>
internal sealed class ContextStage<OutType> : IContextStage<OutType> {
    private readonly DownloadContextBuilder _ctxBuilder;
    private readonly AsyncTransformer<ByteDocument, OutType> _transformer;
    private FragmentMode _fragmentMode = FragmentMode.Single;
    private Channel _channel = Channel.Plain;
    private readonly ContentKind _contentKind;

    // Written by WithParallelism (clamped to >= 1); nothing in this class reads it back.
    // NOTE(review): confirm whether this value is supposed to reach the downloaders.
    private int _parallelism = 4;
    private UnitDownloaderOptionsBuilder<OutType> _optionsBuilder = new();

    // ──────────────── playwright ────────────────
    private PlaywrightAsyncManipulator? _playwrightManipulator;
    // ────────────────────────────────────────────

    // ──────────────── stealth ───────────────────
    private StealthAsyncManipulator? _stealthManipulator;
    private StealthConfig? _stealthConfig;
    // ────────────────────────────────────────────

    /// <summary>
    /// Creates the stage, derives the content kind from the transformer's type and registers
    /// the transformer on the unit-downloader options builder.
    /// </summary>
    /// <param name="ctxBuilder">Builder for the download context produced by <see cref="Build"/>.</param>
    /// <param name="transformer">Transformer applied to each downloaded document.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="ctxBuilder"/> or <paramref name="transformer"/> is <c>null</c>.
    /// </exception>
    /// <exception cref="ArgumentException">The transformer type is not supported.</exception>
    public ContextStage(DownloadContextBuilder ctxBuilder,
                        AsyncTransformer<ByteDocument, OutType> transformer) {
        // Fail fast: without these guards a null transformer reached the fallback arm below
        // and crashed with a NullReferenceException while formatting the error message.
        ArgumentNullException.ThrowIfNull(ctxBuilder);
        ArgumentNullException.ThrowIfNull(transformer);

        _ctxBuilder = ctxBuilder;
        _transformer = transformer;

        // NOTE(review): the parameter is statically typed as AsyncTransformer<ByteDocument, OutType>;
        // confirm the StringDocument arm is still meaningful after the RawType removal.
        _contentKind = transformer switch {
            AsyncTransformer<StringDocument, OutType> => ContentKind.File,
            AsyncTransformer<ByteDocument, OutType> => ContentKind.Binary,
            _ => throw new ArgumentException(string.Format(Exceptions.Exceptions.fluent_unsupported_transformer,
                                                           transformer.GetType()
                                                                      .AsUniqueName()))
        };

        _optionsBuilder.WithAsyncTransformer(_transformer);
    }

    /// <summary>Applies <paramref name="configure"/> to the underlying download context builder.</summary>
    /// <param name="configure">Callback that receives the context builder.</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="configure"/> is <c>null</c>.</exception>
    public IContextStage<OutType> Configure(Action<DownloadContextBuilder> configure) {
        ArgumentNullException.ThrowIfNull(configure);

        configure(_ctxBuilder);
        return this;
    }

    /// <summary>Applies <paramref name="configure"/> to the unit-downloader options builder.</summary>
    /// <param name="configure">Callback that receives the options builder.</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="configure"/> is <c>null</c>.</exception>
    public IContextStage<OutType> ConfigureUnitDownloaderOptions(
        Action<UnitDownloaderOptionsBuilder<OutType>> configure) {
        ArgumentNullException.ThrowIfNull(configure);

        configure(_optionsBuilder);
        return this;
    }

    /// <summary>Stores the requested degree of parallelism; values below 1 are raised to 1.</summary>
    /// <param name="degree">Requested degree of parallelism.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<OutType> WithParallelism(int degree) {
        _parallelism = Math.Max(1, degree);
        return this;
    }

    /// <summary>Forwards <paramref name="timeout"/> to the download context builder.</summary>
    /// <param name="timeout">Timeout handed to the context builder.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<OutType> WithTimeout(TimeSpan timeout) {
        _ctxBuilder.WithTimeOut(timeout);
        return this;
    }

    /// <summary>Forwards <paramref name="reporter"/> to the download context builder.</summary>
    /// <param name="reporter">Retry reporter handed to the context builder.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<OutType> WithRetryReporter(IProgress<IRetryReport> reporter) {
        _ctxBuilder.WithRetryReporter(reporter);
        return this;
    }

    /// <summary>
    /// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive with
    /// <see cref="UsePlaywright(PlaywrightAsyncManipulator)"/>: any Playwright manipulator is dropped and a
    /// Playwright channel falls back to the plain channel. A stealth channel is left as it is.
    /// </summary>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<OutType> UseFragments() {
        _playwrightManipulator = null;
        if (_channel == Channel.Playwright) {
            _channel = Channel.Plain;
        }

        _fragmentMode = FragmentMode.Fragmented;
        return this;
    }

    /// <summary>
    /// Use a puppet browser to download the links. This strategy is mutually exclusive with
    /// <see cref="UseFragments"/> (fragment mode is reset to single) and replaces any stealth
    /// setup configured earlier.
    /// </summary>
    /// <param name="manipulator">The page manipulator</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="manipulator"/> is <c>null</c>.</exception>
    public IContextStage<OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator) {
        ArgumentNullException.ThrowIfNull(manipulator);

        _fragmentMode = FragmentMode.Single;

        // Drop the complete stealth setup, not only the manipulator, so no stale config lingers.
        _stealthManipulator = null;
        _stealthConfig = null;

        _channel = Channel.Playwright;
        _playwrightManipulator = manipulator;
        return this;
    }

    /// <summary>
    /// Routes downloads through the stealth channel. Any Playwright manipulator configured earlier
    /// is dropped; the fragment mode is left untouched.
    /// </summary>
    /// <param name="manipulator">The stealth page manipulator.</param>
    /// <param name="config">Configuration handed to the stealth downloader.</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="manipulator"/> or <paramref name="config"/> is <c>null</c>.
    /// </exception>
    public IContextStage<OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) {
        ArgumentNullException.ThrowIfNull(manipulator);
        ArgumentNullException.ThrowIfNull(config);

        _playwrightManipulator = null;

        _channel = Channel.Stealth;
        _stealthManipulator = manipulator;
        _stealthConfig = config;
        return this;
    }

    /// <summary>
    /// Builds the unit downloader matching the current (channel, fragment mode, content kind) combination.
    /// The result is returned as <see cref="object"/> because the concrete downloader type differs per
    /// combination; callers cast it to the interface they expect.
    /// </summary>
    /// <param name="context">Context supplying the client and the optional failure predicates.</param>
    /// <returns>The unit downloader for the configured combination.</returns>
    /// <exception cref="InvalidOperationException">
    /// The combination is not supported, or a manipulator/config required by the channel is missing.
    /// </exception>
    private object ConstructUnitDownloader(DownloadContext context) {
        // Guards state that the selected channel requires but that may never have been set.
        static T EnsureExists<T>(T? o) where T : class
            => o ?? throw new InvalidOperationException(Exceptions.Exceptions.fluent_invalid_state);

        if (context.AsyncFailurePredicates is not null) {
            _optionsBuilder
                .WithFailurePredicates(x => x.WithPredicates(context.AsyncFailurePredicates));
        }

        var options = _optionsBuilder
            .WithClient(context.Client)
            .Build();

        return (_channel, _fragmentMode, _contentKind) switch {
            // ──────────────── fragmented ────────────────
            (Channel.Plain, FragmentMode.Fragmented, _)
                => new UnitFragmentDownloader<OutType>(options),
            // ──────────────── single ────────────────
            (Channel.Plain, FragmentMode.Single, _)
                => new UnitDownloader<OutType>(options),
            // ──────────────── single playwright ────────────────
            (Channel.Playwright, FragmentMode.Single, _)
                => new PlaywrightUnitDownloader<OutType>(options, EnsureExists(_playwrightManipulator)),
            // ──────────────── single stealth ────────────────
            (Channel.Stealth, FragmentMode.Single, ContentKind.Binary)
                => new StealthUnitDownloader<OutType>(options,
                                                      EnsureExists(_stealthConfig),
                                                      EnsureExists(_stealthManipulator)),
            // ──────────────── fragment stealth ────────────────
            (Channel.Stealth, FragmentMode.Fragmented, ContentKind.Binary)
                => new StealthFragmentDownloader<OutType>(options,
                                                          EnsureExists(_stealthConfig),
                                                          EnsureExists(_stealthManipulator)),
            _ => throw new InvalidOperationException(
                string.Format(Exceptions.Exceptions.fluent_unsupported_pattern,
                              $"({_channel}, {_fragmentMode}, {_contentKind})")),
        };
    }

    /// <summary>
    /// Creates the sequential downloader for the current fragment mode and adapts it to an
    /// enumerator of ordered results.
    /// </summary>
    /// <param name="context">Built download context; a copy of it is handed to the downloader.</param>
    /// <returns>An async enumerator over the ordered download results.</returns>
    /// <exception cref="InvalidOperationException">The fragment mode is not supported.</exception>
    private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext context) {
        // The downloader receives its own copy of the context; the logger is taken from the original.
        var copyOfContext = DownloadContextBuilder.FromContext(context).Build();

        return _fragmentMode switch {
            FragmentMode.Fragmented => new SequentialFragmentDownloader<OutType>(
                copyOfContext,
                ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
                context.DownloadLogger).UnwrapFragmented(),
            FragmentMode.Single => new SequentialDownloader<OutType>(
                copyOfContext,
                ctx => (IUnitDownloader<OutType>)ConstructUnitDownloader(ctx),
                context.DownloadLogger).WrapOrdered(),
            _ => throw new InvalidOperationException(
                string.Format(Exceptions.Exceptions.fluent_unsupported_pattern,
                              $"{_fragmentMode}")),
        };
    }

    /// <summary>
    /// Builds the download context and wraps the configured downloader in a
    /// <see cref="DownloadEnumerable{T}"/>.
    /// </summary>
    /// <returns>The enumerable over the download results.</returns>
    public DownloadEnumerable<OutType> Build() {
        var context = _ctxBuilder.Build();
        return new DownloadEnumerable<OutType>(ConstructDownloader(context));
    }
}