feat: add Puppeteer integration for web downloads

This introduces a new Puppeteer-based mechanism for downloading
web content. It provides a flexible way to manipulate pages
during downloads, which improves the handling of dynamic
content and the overall download process.
This commit is contained in:
qwsdcvghyu89
2025-06-25 13:42:24 +03:00
parent 2317db9d3f
commit 3baa31a7cc
5 changed files with 117 additions and 25 deletions
+47 -9
View File
@@ -5,6 +5,7 @@ using Microsoft.Extensions.Logging;
using System;
using System.Collections.Generic;
using HtmlAgilityPack;
using Beam.Puppeteer;
namespace Beam.Temporary.Cli {
/// <summary>
@@ -151,7 +152,8 @@ namespace Beam.Temporary.Cli {
private readonly DownloadContextBuilder<RawType> _ctxBuilder;
private readonly AsyncTransformer<RawType, OutType> _transformer;
private int _parallelism = 4;
private bool useFragments = false;
private bool _useFragments = false;
private AsyncManipulator? _useManipulator = null;
public ContextStage(DownloadContextBuilder<RawType> ctxBuilder, AsyncTransformer<RawType, OutType> transformer) {
_ctxBuilder = ctxBuilder;
@@ -178,15 +180,35 @@ namespace Beam.Temporary.Cli {
return this;
}
/// <summary>
/// Switches this stage to the fragment strategy, which downloads multiple links in parallel.
/// Mutually exclusive with <see cref="UsePuppet(AsyncManipulator)"/>: any manipulator set earlier is discarded.
/// </summary>
/// <returns>This stage instance.</returns>
public IContextStage UseFragments() {
    // Clearing unconditionally is equivalent to a guarded reset: a null field simply stays null.
    _useManipulator = null;
    // NOTE(review): both the unprefixed and the underscore-prefixed flag are written here,
    // mirroring the two field declarations visible in this file; confirm the unprefixed one is still needed.
    useFragments = true;
    _useFragments = true;
    return this;
}
/// <summary>
/// Selects the puppet-browser strategy for downloading the links.
/// Mutually exclusive with <see cref="UseFragments"/>: calling this turns fragment mode off.
/// </summary>
/// <param name="manipulator">The page manipulator, stored and later read when the unit downloader is constructed.</param>
/// <returns>This stage instance.</returns>
public IContextStage UsePuppet(AsyncManipulator manipulator) {
    // Resetting unconditionally is equivalent to a guarded reset: an already-false flag stays false.
    _useFragments = false;
    _useManipulator = manipulator;
    return this;
}
private object ConstructUnitDownloader(DownloadContext<RawType> context) {
return (useFragments, _transformer, context.AsyncFailurePredicates) switch {
return (_useFragments, _useManipulator, _transformer, context.AsyncFailurePredicates) switch {
// ──────────────── fragmented HTML ────────────────
(true, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
(true, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
=> new UnitFragmentDownloader<OutType>(
context.Web,
@@ -195,7 +217,7 @@ namespace Beam.Temporary.Cli {
_parallelism,
context.DownloadLogger),
// ──────────────── fragmented binary ────────────────
(true, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
(true, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
=> new UnitFragmentDownloaderBinary<OutType>(
context.Client,
@@ -204,26 +226,42 @@ namespace Beam.Temporary.Cli {
_parallelism,
context.DownloadLogger),
// ──────────────── single HTML ────────────────
(false, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
(false, null, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
=> new UnitDownloader<OutType>(
context.Web,
asyncHtmlTransformer,
documentFailurePredicates),
// ──────────────── single binary ────────────────
(false, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
(false, null, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
=> new UnitDownloaderBinary<OutType>(
context.Client,
asyncBinaryTransformer,
responseFailurePredicates),
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
// ──────────────── single puppet binary ────────────────
(false, AsyncManipulator manipulator, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
=> new PuppetUnitDownloader<OutType>(
context.Client,
manipulator,
asyncBinaryTransformer,
responseFailurePredicates),
// ──────────────── single puppet HTML ────────────────
(false, AsyncManipulator manipulator, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
=> new PuppetUnitPageDownloader<OutType>(
context.Web,
manipulator,
asyncHtmlTransformer,
documentFailurePredicates),
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {_useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
};
}
private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
var copyOfContext = context.CreateBuilder().Build();
return useFragments switch {
return _useFragments switch {
true => new SequentialFragmentDownloader<RawType, OutType>(
copyOfContext,
ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),