feat: add Puppeteer integration for web downloads
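This introduces a new Puppeteer-based mechanism for downloading web content. `ContextStage` gains `UsePuppet(AsyncManipulator)`, which downloads links through a puppet browser driven by a caller-supplied page manipulator, giving a flexible way to manipulate pages during downloads and to better handle dynamic content. The puppet strategy is mutually exclusive with `UseFragments()`: selecting one clears the other.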
This commit is contained in:
@@ -5,6 +5,7 @@ using Microsoft.Extensions.Logging;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using HtmlAgilityPack;
|
||||
using Beam.Puppeteer;
|
||||
|
||||
namespace Beam.Temporary.Cli {
|
||||
/// <summary>
|
||||
@@ -151,7 +152,8 @@ namespace Beam.Temporary.Cli {
|
||||
private readonly DownloadContextBuilder<RawType> _ctxBuilder;
|
||||
private readonly AsyncTransformer<RawType, OutType> _transformer;
|
||||
private int _parallelism = 4;
|
||||
private bool useFragments = false;
|
||||
private bool _useFragments = false;
|
||||
private AsyncManipulator? _useManipulator = null;
|
||||
|
||||
public ContextStage(DownloadContextBuilder<RawType> ctxBuilder, AsyncTransformer<RawType, OutType> transformer) {
|
||||
_ctxBuilder = ctxBuilder;
|
||||
@@ -178,15 +180,35 @@ namespace Beam.Temporary.Cli {
|
||||
return this;
|
||||
}
|
||||
|
||||
/// <summary>
/// Selects the fragment strategy, which downloads multiple links in parallel.
/// This strategy is mutually exclusive with <see cref="UsePuppet(AsyncManipulator)"/>:
/// any previously configured page manipulator is discarded.
/// </summary>
/// <returns>This stage, so that configuration calls can be chained.</returns>
public IContextStage UseFragments() {
    // Clear the puppet manipulator so that only one strategy is active at a time.
    _useManipulator = null;
    _useFragments = true;
    return this;
}
|
||||
|
||||
/// <summary>
/// Selects the puppet-browser strategy for downloading the links.
/// This strategy is mutually exclusive with <see cref="UseFragments"/>:
/// fragment mode is switched off when a manipulator is installed.
/// </summary>
/// <param name="manipulator">The page manipulator; must not be <c>null</c>.</param>
/// <returns>This stage, so that configuration calls can be chained.</returns>
/// <exception cref="ArgumentNullException"><paramref name="manipulator"/> is <c>null</c>.</exception>
public IContextStage UsePuppet(AsyncManipulator manipulator) {
    // Storing null would turn fragments off without enabling the puppet browser,
    // leaving neither strategy selected; reject it up front instead.
    if (manipulator is null) {
        throw new ArgumentNullException(nameof(manipulator));
    }

    _useFragments = false;
    _useManipulator = manipulator;
    return this;
}
|
||||
|
||||
private object ConstructUnitDownloader(DownloadContext<RawType> context) {
|
||||
return (useFragments, _transformer, context.AsyncFailurePredicates) switch {
|
||||
return (_useFragments, _useManipulator, _transformer, context.AsyncFailurePredicates) switch {
|
||||
// ──────────────── fragmented HTML ────────────────
|
||||
(true, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
(true, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
||||
=> new UnitFragmentDownloader<OutType>(
|
||||
context.Web,
|
||||
@@ -195,7 +217,7 @@ namespace Beam.Temporary.Cli {
|
||||
_parallelism,
|
||||
context.DownloadLogger),
|
||||
// ──────────────── fragmented binary ────────────────
|
||||
(true, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
(true, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
||||
=> new UnitFragmentDownloaderBinary<OutType>(
|
||||
context.Client,
|
||||
@@ -204,26 +226,42 @@ namespace Beam.Temporary.Cli {
|
||||
_parallelism,
|
||||
context.DownloadLogger),
|
||||
// ──────────────── single HTML ────────────────
|
||||
(false, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
(false, null, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
||||
=> new UnitDownloader<OutType>(
|
||||
context.Web,
|
||||
asyncHtmlTransformer,
|
||||
documentFailurePredicates),
|
||||
// ──────────────── single binary ────────────────
|
||||
(false, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
(false, null, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
||||
=> new UnitDownloaderBinary<OutType>(
|
||||
context.Client,
|
||||
asyncBinaryTransformer,
|
||||
responseFailurePredicates),
|
||||
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
|
||||
// ──────────────── single puppet binary ────────────────
|
||||
(false, AsyncManipulator manipulator, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
||||
=> new PuppetUnitDownloader<OutType>(
|
||||
context.Client,
|
||||
manipulator,
|
||||
asyncBinaryTransformer,
|
||||
responseFailurePredicates),
|
||||
// ──────────────── single puppet HTML ────────────────
|
||||
(false, AsyncManipulator manipulator, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
||||
=> new PuppetUnitPageDownloader<OutType>(
|
||||
context.Web,
|
||||
manipulator,
|
||||
asyncHtmlTransformer,
|
||||
documentFailurePredicates),
|
||||
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {_useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
|
||||
};
|
||||
}
|
||||
|
||||
private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
|
||||
var copyOfContext = context.CreateBuilder().Build();
|
||||
return useFragments switch {
|
||||
return _useFragments switch {
|
||||
true => new SequentialFragmentDownloader<RawType, OutType>(
|
||||
copyOfContext,
|
||||
ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
|
||||
|
||||
Reference in New Issue
Block a user