7ed05abdb8
- Introduced modularity by splitting Beam into new projects: Beam.Abstractions, Beam.Models, and Beam.Downloaders. - Refactored existing classes into appropriate namespaces and projects. - Replaced specific implementations with abstractions (e.g., SourceLinkBuilder to LinkBuilder, State to IState, etc.). - Updated interfaces: added ITemplate, IArticleData, IDownloadReport, and others for improved extensibility. - Removed deprecated classes like SourceLinkBuilder and StateChangerFactory. - Enhanced link handling in downloaders by refactoring to use `string` over `SourceLink`. - Consolidated shared logic under Beam.Abstractions.
191 lines
11 KiB
C#
191 lines
11 KiB
C#
using Beam.Abstractions;
|
|
using Beam.Models;
|
|
using HtmlAgilityPack;
|
|
using Beam.Playwright;
|
|
using Beam.Stealth;
|
|
using Beam;
|
|
using Beam.Downloaders;
|
|
|
|
namespace Beam.Fluent {
|
|
public static partial class DownloadBuilder<RawType, OutType> {
|
|
/// <summary>
/// Fluent stage that collects the download options (context settings, parallelism and
/// download strategy) and assembles the resulting download enumerable in <see cref="Build"/>.
/// </summary>
private sealed class ContextStage : IContextStage {
    private readonly DownloadContextBuilder<RawType> _ctxBuilder;
    private readonly AsyncTransformer<RawType, OutType> _transformer;

    // Degree of parallelism handed to the fragment downloaders; WithParallelism keeps it >= 1.
    private int _parallelism = 4;

    // Strategy selection state. Each Use* method resets the strategies it excludes.
    private bool _useFragments;
    private PlaywrightAsyncManipulator? _usePlaywrightManipulator;
    private StealthAsyncManipulator? _useStealthManipulator;
    private StealthConfig? _stealthConfig;

    /// <summary>
    /// Creates the stage from the context builder it configures and the transformer
    /// that is passed on to the constructed unit downloader.
    /// </summary>
    /// <param name="ctxBuilder">Builder of the download context.</param>
    /// <param name="transformer">Transformer from <typeparamref name="RawType"/> to <typeparamref name="OutType"/>.</param>
    public ContextStage(DownloadContextBuilder<RawType> ctxBuilder, AsyncTransformer<RawType, OutType> transformer) {
        _ctxBuilder = ctxBuilder;
        _transformer = transformer;
    }

    /// <summary>
    /// Applies a caller supplied configuration callback to the underlying context builder.
    /// </summary>
    /// <param name="configure">Callback that receives the context builder.</param>
    /// <returns>This stage, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="configure"/> is <c>null</c>.</exception>
    public IContextStage Configure(Action<DownloadContextBuilder<RawType>> configure) {
        if (configure is null) {
            throw new ArgumentNullException(nameof(configure));
        }

        configure(_ctxBuilder);
        return this;
    }

    /// <summary>
    /// Sets the degree of parallelism. Values below 1 are clamped to 1.
    /// </summary>
    /// <param name="degree">Requested degree of parallelism.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage WithParallelism(int degree) {
        _parallelism = Math.Max(1, degree);
        return this;
    }

    /// <summary>
    /// Forwards the timeout to the underlying context builder.
    /// </summary>
    /// <param name="timeout">Timeout to configure.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage WithTimeout(TimeSpan timeout) {
        _ctxBuilder.WithTimeOut(timeout);
        return this;
    }

    /// <summary>
    /// Forwards the retry reporter to the underlying context builder.
    /// </summary>
    /// <param name="reporter">Progress sink for retry reports.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage WithRetryReporter(IProgress<IRetryReport> reporter) {
        _ctxBuilder.WithRetryReporter(reporter);
        return this;
    }

    /// <summary>
    /// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive with
    /// <see cref="UsePlaywright(PlaywrightAsyncManipulator)"/>: any previously selected Playwright
    /// manipulator is discarded. A stealth manipulator, if selected, is kept.
    /// </summary>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage UseFragments() {
        // The last strategy call wins, so drop the Playwright selection unconditionally.
        _usePlaywrightManipulator = null;
        _useFragments = true;
        return this;
    }

    /// <summary>
    /// Use a puppet browser to download the links. This strategy is mutually exclusive with
    /// <see cref="UseFragments"/> and <see cref="UseStealth(StealthAsyncManipulator, StealthConfig)"/>:
    /// both selections are reset.
    /// </summary>
    /// <param name="manipulator">The page manipulator</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage UsePlaywright(PlaywrightAsyncManipulator manipulator) {
        _useFragments = false;
        _useStealthManipulator = null;
        _usePlaywrightManipulator = manipulator;
        return this;
    }

    /// <summary>
    /// Selects the stealth download strategy. This strategy is mutually exclusive with
    /// <see cref="UsePlaywright(PlaywrightAsyncManipulator)"/>: any previously selected Playwright
    /// manipulator is discarded. The fragment selection is left untouched.
    /// </summary>
    /// <param name="manipulator">The stealth page manipulator.</param>
    /// <param name="config">Stealth configuration handed to the stealth downloaders.</param>
    /// <returns>This stage, for chaining.</returns>
    public IContextStage UseStealth(StealthAsyncManipulator manipulator, StealthConfig config) {
        _usePlaywrightManipulator = null;
        _useStealthManipulator = manipulator;
        _stealthConfig = config;
        return this;
    }

    /// <summary>
    /// Returns the stealth configuration, failing when a stealth strategy was selected without one.
    /// </summary>
    /// <exception cref="InvalidOperationException">No stealth configuration has been stored.</exception>
    private StealthConfig RequireStealthConfig() {
        return _stealthConfig ?? throw new InvalidOperationException("Stealth config is null");
    }

    /// <summary>
    /// Picks the unit downloader matching the selected strategy, the runtime type of the
    /// transformer and the runtime type of the context's failure predicates.
    /// </summary>
    /// <remarks>
    /// The result is typed as <see cref="object"/> because the arms create different downloader
    /// types; <see cref="ConstructDownloader"/> casts it to the unit downloader interface it needs.
    /// Arms are evaluated top to bottom, so their order is significant.
    /// </remarks>
    /// <param name="context">Context supplying the web/client instances, predicates and logger.</param>
    /// <returns>The constructed unit downloader.</returns>
    /// <exception cref="InvalidOperationException">A stealth arm matched but no stealth configuration is set.</exception>
    /// <exception cref="NotSupportedException">No arm matches the current combination.</exception>
    private object ConstructUnitDownloader(DownloadContext<RawType> context) {
        return (_useFragments, _useStealthManipulator, _usePlaywrightManipulator, _transformer, context.AsyncFailurePredicates) switch {
            // ──────────────── fragmented HTML ────────────────
            (true, null, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
                AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
                => new UnitFragmentDownloader<OutType>(
                    context.Web,
                    asyncHtmlTransformer,
                    documentFailurePredicates,
                    _parallelism,
                    context.DownloadLogger),

            // ──────────────── fragmented binary ────────────────
            (true, null, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
                AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
                => new UnitFragmentDownloaderBinary<OutType>(
                    context.Client,
                    asyncBinaryTransformer,
                    responseFailurePredicates,
                    _parallelism,
                    context.DownloadLogger),

            // ──────────────── single HTML ────────────────
            (false, null, null, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
                AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
                => new UnitDownloader<OutType>(
                    context.Web,
                    asyncHtmlTransformer,
                    documentFailurePredicates),

            // ──────────────── single binary ────────────────
            (false, null, null, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
                AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
                => new UnitDownloaderBinary<OutType>(
                    context.Client,
                    asyncBinaryTransformer,
                    responseFailurePredicates),

            // ──────────────── single playwright binary ────────────────
            (false, _, PlaywrightAsyncManipulator manipulator, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
                AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
                => new PlaywrightUnitDownloader<OutType>(
                    context.Client,
                    manipulator,
                    asyncBinaryTransformer,
                    responseFailurePredicates),

            // ──────────────── single playwright HTML ────────────────
            (false, _, PlaywrightAsyncManipulator manipulator, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
                AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
                => new PlaywrightUnitPageDownloader<OutType>(
                    context.Web,
                    manipulator,
                    asyncHtmlTransformer,
                    documentFailurePredicates),

            // ──────────────── single stealth HTML ────────────────
            (false, StealthAsyncManipulator manipulator, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
                AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
                => new StealthUnitPageDownloader<OutType>(
                    context.Web,
                    RequireStealthConfig(),
                    manipulator,
                    asyncHtmlTransformer,
                    documentFailurePredicates),

            // ──────────────── single stealth binary ────────────────
            (false, StealthAsyncManipulator manipulator, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
                AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
                => new StealthUnitDownloader<OutType>(
                    context.Client,
                    RequireStealthConfig(),
                    manipulator,
                    asyncBinaryTransformer,
                    responseFailurePredicates),

            // ──────────────── fragment stealth HTML ────────────────
            // NOTE(review): the failure predicates are only used to select this arm and are not
            // forwarded to the downloader — confirm this is intended.
            (true, StealthAsyncManipulator manipulator, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
                AsyncDownloadFailurePredicate<HtmlDocument>[])
                => new StealthFragmentPageDownloader<OutType>(
                    context.Web,
                    RequireStealthConfig(),
                    manipulator,
                    asyncHtmlTransformer),

            // ──────────────── fragment stealth binary ────────────────
            // NOTE(review): as above, the failure predicates are not forwarded — confirm.
            (true, StealthAsyncManipulator manipulator, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
                AsyncDownloadFailurePredicate<ByteDocument>[])
                => new StealthFragmentDownloader<OutType>(
                    context.Client,
                    RequireStealthConfig(),
                    manipulator,
                    asyncBinaryTransformer),

            // Also reached when the context's failure predicates are null, since no type pattern matches null.
            _ => throw new NotSupportedException($"Unsupported transformer / failure-predicate combination. Missing pattern: {_useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
        };
    }

    /// <summary>
    /// Wraps the unit downloader in a sequential downloader and adapts it to an enumerator of
    /// ordered results: the fragment variant is unwrapped, the plain variant is wrapped.
    /// </summary>
    /// <param name="context">The built download context; a copy of it is handed to the downloader.</param>
    /// <returns>Enumerator over the ordered download results.</returns>
    private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
        // The sequential downloader receives its own context instance rebuilt from the given one.
        var copyOfContext = DownloadContextBuilder<RawType>.FromContext(context).Build();

        if (_useFragments) {
            return new SequentialFragmentDownloader<RawType, OutType>(
                copyOfContext,
                ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
                context.DownloadLogger).UnwrapFragmented();
        }

        return new SequentialDownloader<RawType, OutType>(
            copyOfContext,
            ctx => (IUnitDownloader<OutType>)ConstructUnitDownloader(ctx),
            context.DownloadLogger).WrapOrdered();
    }

    /// <summary>
    /// Builds the download context from the accumulated settings and returns the enumerable
    /// backed by the downloader selected for the configured strategy.
    /// </summary>
    /// <returns>The download enumerable.</returns>
    public DownloadEnumerable<OutType> Build() {
        var context = _ctxBuilder.Build();
        return new DownloadEnumerable<OutType>(ConstructDownloader(context));
    }
}
|
|
}
|
|
}
|