230 lines
11 KiB
C#
230 lines
11 KiB
C#
using Beam.Models;
|
|
using HtmlAgilityPack;
|
|
using Beam.Playwright;
|
|
using Beam.Stealth;
|
|
using Beam;
|
|
using Beam.Abstractions;
|
|
using Beam.Downloaders;
|
|
|
|
namespace Beam.Fluent;
|
|
|
|
/// <summary>
/// Fluent stage that collects how a download should be executed (channel, fragment mode,
/// parallelism and context settings) and finally builds a <see cref="DownloadEnumerable{OutType}"/>.
/// </summary>
/// <typeparam name="RawType">Raw document type handed to the transformer; only
/// <see cref="HtmlDocument"/> and <c>ByteDocument</c> transformers are accepted by the constructor.</typeparam>
/// <typeparam name="OutType">Type produced by the transformer.</typeparam>
internal sealed class ContextStage<RawType, OutType> : IContextStage<RawType, OutType> {
    private readonly DownloadContextBuilder<RawType> _ctxBuilder;
    private readonly AsyncTransformer<RawType, OutType> _transformer;
    private FragmentMode _fragmentMode = FragmentMode.Single;
    private Channel _channel = Channel.Plain;
    private readonly ContentKind _contentKind;
    private int _parallelism = 4;

    // ──────────────── playwright ────────────────
    private PlaywrightAsyncManipulator? _playwrightManipulator = null;
    // ────────────────────────────────────────────

    // ──────────────── stealth ───────────────────
    private StealthAsyncManipulator? _stealthManipulator = null;
    private StealthConfig? _stealthConfig = null;
    // ────────────────────────────────────────────

    /// <summary>
    /// Creates the stage and derives the content kind from the transformer's document type.
    /// </summary>
    /// <param name="ctxBuilder">Builder that accumulates the download context settings.</param>
    /// <param name="transformer">Transformer applied to every downloaded document.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">The transformer is neither an HTML nor a binary transformer.</exception>
    public ContextStage(DownloadContextBuilder<RawType> ctxBuilder,
                        AsyncTransformer<RawType, OutType> transformer) {
        // Reject null up front: a null transformer would otherwise reach the default switch arm
        // below and fail with a NullReferenceException on transformer.GetType().
        _ctxBuilder = ctxBuilder ?? throw new ArgumentNullException(nameof(ctxBuilder));
        _transformer = transformer ?? throw new ArgumentNullException(nameof(transformer));
        _contentKind = transformer switch {
            AsyncTransformer<HtmlDocument, OutType> => ContentKind.Html,
            AsyncTransformer<ByteDocument, OutType> => ContentKind.Binary,
            _ => throw new ArgumentException(string.Format(Exceptions.Exceptions.fluent_unsupported_transformer,
                                                           transformer.GetType()
                                                                      .AsUniqueName()))
        };
    }

    /// <summary>
    /// Gives the caller direct access to the underlying context builder.
    /// </summary>
    /// <param name="configure">Callback invoked once with the context builder.</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="configure"/> is <c>null</c>.</exception>
    public IContextStage<RawType, OutType> Configure(Action<DownloadContextBuilder<RawType>> configure) {
        if (configure is null) {
            throw new ArgumentNullException(nameof(configure));
        }

        configure(_ctxBuilder);
        return this;
    }

    /// <summary>
    /// Sets the degree of parallelism handed to the plain-channel fragment downloaders.
    /// </summary>
    /// <param name="degree">Requested degree; values below 1 are clamped to 1.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<RawType, OutType> WithParallelism(int degree) {
        _parallelism = Math.Max(1, degree);
        return this;
    }

    /// <summary>
    /// Forwards the timeout to the context builder.
    /// </summary>
    /// <param name="timeout">Timeout to apply.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<RawType, OutType> WithTimeout(TimeSpan timeout) {
        _ctxBuilder.WithTimeOut(timeout);
        return this;
    }

    /// <summary>
    /// Forwards the retry reporter to the context builder.
    /// </summary>
    /// <param name="reporter">Receiver of retry reports.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<RawType, OutType> WithRetryReporter(IProgress<IRetryReport> reporter) {
        _ctxBuilder.WithRetryReporter(reporter);
        return this;
    }

    /// <summary>
    /// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive
    /// with <see cref="UsePlaywright(PlaywrightAsyncManipulator)"/>: a previously selected Playwright
    /// channel is reset to the plain channel. A stealth channel is left untouched.
    /// </summary>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage<RawType, OutType> UseFragments() {
        _playwrightManipulator = null;
        if (_channel == Channel.Playwright) {
            _channel = Channel.Plain;
        }

        _fragmentMode = FragmentMode.Fragmented;
        return this;
    }

    /// <summary>
    /// Use a puppet browser to download the links. This strategy is mutually exclusive with
    /// <see cref="UseFragments"/> and replaces any previously selected stealth setup.
    /// </summary>
    /// <param name="manipulator">The page manipulator</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="manipulator"/> is <c>null</c>.</exception>
    public IContextStage<RawType, OutType> UsePlaywright(PlaywrightAsyncManipulator manipulator) {
        // Validate before touching any state so a failed call leaves the stage unchanged.
        _playwrightManipulator = manipulator ?? throw new ArgumentNullException(nameof(manipulator));
        _channel = Channel.Playwright;
        _fragmentMode = FragmentMode.Single;

        // Drop the complete stealth setup, not only the manipulator.
        _stealthManipulator = null;
        _stealthConfig = null;
        return this;
    }

    /// <summary>
    /// Use the stealth channel to download the links. Replaces a previously selected Playwright
    /// manipulator; the current fragment mode is kept.
    /// </summary>
    /// <param name="manipulator">The stealth manipulator</param>
    /// <param name="config">The stealth configuration</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public IContextStage<RawType, OutType> UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) {
        if (manipulator is null) {
            throw new ArgumentNullException(nameof(manipulator));
        }

        if (config is null) {
            throw new ArgumentNullException(nameof(config));
        }

        _playwrightManipulator = null;
        _channel = Channel.Stealth;
        _stealthManipulator = manipulator;
        _stealthConfig = config;
        return this;
    }

    /// <summary>
    /// Creates the unit downloader matching the current (channel, fragment mode, content kind)
    /// combination. The result is returned as <see cref="object"/> because the concrete downloader
    /// types differ per combination; the caller casts it to the interface it expects.
    /// </summary>
    /// <param name="context">Context supplying the web/client objects, failure predicates and logger.</param>
    /// <returns>The unit downloader for the selected combination.</returns>
    /// <exception cref="InvalidCastException">The transformer or failure predicates do not match the content kind.</exception>
    /// <exception cref="InvalidOperationException">A required manipulator/config is missing, or the combination is unsupported.</exception>
    private object ConstructUnitDownloader(DownloadContext<RawType> context) {
        #region Utility functions

        // Casts to the requested reference type or fails with a descriptive message.
        T To<T>(object? o) where T : class
            => (o as T) ??
               throw new InvalidCastException(
                   string.Format(Exceptions.Exceptions.fluent_type_conversion_failure,
                                 o?.GetType().AsUniqueName() ?? "null", typeof(T).AsUniqueName()));

        AsyncTransformer<HtmlDocument, OutType> HtmlTransformer()
            => To<AsyncTransformer<HtmlDocument, OutType>>(_transformer);

        AsyncTransformer<ByteDocument, OutType> ByteTransformer()
            => To<AsyncTransformer<ByteDocument, OutType>>(_transformer);

        AsyncDownloadFailurePredicate<HtmlDocument>[] HtmlFailurePredicates()
            => To<AsyncDownloadFailurePredicate<HtmlDocument>[]>(context.AsyncFailurePredicates);

        AsyncDownloadFailurePredicate<ByteDocument>[] ByteFailurePredicates()
            => To<AsyncDownloadFailurePredicate<ByteDocument>[]>(context.AsyncFailurePredicates);

        // Guards the channel-specific fields that are only set by UsePlaywright / UseStealth.
        T EnsureExists<T>(T? o) where T : class
            => o ?? throw new InvalidOperationException(Exceptions.Exceptions.fluent_invalid_state);

        #endregion

        return (_channel, _fragmentMode, _contentKind) switch {
            // ──────────────── fragmented HTML ────────────────
            (Channel.Plain, FragmentMode.Fragmented, ContentKind.Html)
                => new UnitFragmentDownloader<OutType>(
                    context.Web,
                    HtmlTransformer(),
                    HtmlFailurePredicates(),
                    _parallelism,
                    context.DownloadLogger),
            // ──────────────── fragmented binary ────────────────
            (Channel.Plain, FragmentMode.Fragmented, ContentKind.Binary)
                => new UnitFragmentDownloaderBinary<OutType>(
                    context.Client,
                    ByteTransformer(),
                    ByteFailurePredicates(),
                    _parallelism,
                    context.DownloadLogger),
            // ──────────────── single HTML ────────────────
            (Channel.Plain, FragmentMode.Single, ContentKind.Html)
                => new UnitDownloader<OutType>(
                    context.Web,
                    HtmlTransformer(),
                    HtmlFailurePredicates()),
            // ──────────────── single binary ────────────────
            (Channel.Plain, FragmentMode.Single, ContentKind.Binary)
                => new UnitDownloaderBinary<OutType>(
                    context.Client,
                    ByteTransformer(),
                    ByteFailurePredicates()),
            // ──────────────── single playwright binary ────────────────
            (Channel.Playwright, FragmentMode.Single, ContentKind.Binary)
                => new PlaywrightUnitDownloader<OutType>(
                    context.Client,
                    EnsureExists(_playwrightManipulator),
                    ByteTransformer(),
                    ByteFailurePredicates()
                ),
            // ──────────────── single playwright HTML ────────────────
            (Channel.Playwright, FragmentMode.Single, ContentKind.Html)
                => new PlaywrightUnitPageDownloader<OutType>(
                    context.Web,
                    EnsureExists(_playwrightManipulator),
                    HtmlTransformer(),
                    HtmlFailurePredicates()),
            // ──────────────── single stealth HTML ────────────────
            (Channel.Stealth, FragmentMode.Single, ContentKind.Html)
                => new StealthUnitPageDownloader<OutType>(
                    context.Web,
                    EnsureExists(_stealthConfig),
                    EnsureExists(_stealthManipulator),
                    HtmlTransformer(),
                    HtmlFailurePredicates()),
            // ──────────────── single stealth binary ────────────────
            (Channel.Stealth, FragmentMode.Single, ContentKind.Binary)
                => new StealthUnitDownloader<OutType>(
                    context.Client,
                    EnsureExists(_stealthConfig),
                    EnsureExists(_stealthManipulator),
                    ByteTransformer(),
                    ByteFailurePredicates()),
            // ──────────────── fragment stealth HTML ────────────────
            (Channel.Stealth, FragmentMode.Fragmented, ContentKind.Html)
                => new StealthFragmentPageDownloader<OutType>(
                    context.Web,
                    EnsureExists(_stealthConfig),
                    EnsureExists(_stealthManipulator),
                    HtmlTransformer(),
                    HtmlFailurePredicates()),
            // ──────────────── fragment stealth binary ────────────────
            (Channel.Stealth, FragmentMode.Fragmented, ContentKind.Binary)
                => new StealthFragmentDownloader<OutType>(
                    context.Client,
                    EnsureExists(_stealthConfig),
                    EnsureExists(_stealthManipulator),
                    ByteTransformer(),
                    ByteFailurePredicates()),
            _ => throw new InvalidOperationException(string.Format(Exceptions.Exceptions.fluent_unsupported_pattern,
                                                                   $"({_channel}, {_fragmentMode}, {_contentKind})")),
        };
    }

    /// <summary>
    /// Wraps the unit-downloader factory into the sequential downloader matching the fragment mode
    /// and exposes it as an enumerator of ordered results.
    /// </summary>
    /// <param name="context">The built download context; a copy of it is handed to the downloader.</param>
    /// <returns>Enumerator over the ordered, transformed results.</returns>
    /// <exception cref="InvalidOperationException">The fragment mode is not supported.</exception>
    private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
        var copyOfContext = DownloadContextBuilder<RawType>.FromContext(context).Build();

        // Freeze the stage settings for this pipeline. The factory lambdas below would otherwise
        // capture `this` and read _channel/_fragmentMode/... whenever they run, so a Use*() call
        // made after Build() could alter an already built pipeline or break the casts below.
        // NOTE(review): the factory is presumably invoked when enumeration starts — confirm.
        var frozen = (ContextStage<RawType, OutType>)MemberwiseClone();

        return frozen._fragmentMode switch {
            FragmentMode.Fragmented => new SequentialFragmentDownloader<RawType, OutType>(
                copyOfContext,
                ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)frozen.ConstructUnitDownloader(ctx),
                context.DownloadLogger).UnwrapFragmented(),
            FragmentMode.Single => new SequentialDownloader<RawType, OutType>(
                copyOfContext,
                ctx => (IUnitDownloader<OutType>)frozen.ConstructUnitDownloader(ctx),
                context.DownloadLogger).WrapOrdered(),
            _ => throw new InvalidOperationException(string.Format(Exceptions.Exceptions.fluent_unsupported_pattern,
                                                                   $"{frozen._fragmentMode}")),
        };
    }

    /// <summary>
    /// Builds the download context from the accumulated settings and returns the enumerable
    /// that drives the download.
    /// </summary>
    /// <returns>The enumerable over the transformed results.</returns>
    public DownloadEnumerable<OutType> Build() {
        var context = _ctxBuilder.Build();
        return new DownloadEnumerable<OutType>(ConstructDownloader(context));
    }
}