feat: add Puppeteer integration for web downloads

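This introduces a new browser-driven mechanism for downloading web
content, built on Microsoft.Playwright and exposed under the
Beam.Puppeteer namespace. Callers supply a page manipulator that runs
against the loaded page before its HTML or file download is captured,
which makes it possible to handle dynamic content that a plain HTTP
request cannot reach.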
This commit is contained in:
qwsdcvghyu89
2025-06-25 13:42:24 +03:00
parent 2317db9d3f
commit 3baa31a7cc
5 changed files with 117 additions and 25 deletions
+16
View File
@@ -0,0 +1,16 @@
using Microsoft.Playwright;
namespace Beam.Puppeteer {
/// <summary>
/// Asynchronous callback that receives a Playwright page and operates on it.
/// The puppet downloaders in this namespace await it after navigating to the
/// link and before capturing the page's content or download.
/// </summary>
/// <param name="page">The page handed to the callback.</param>
/// <returns>A task that completes when the callback has finished.</returns>
public delegate Task AsyncManipulator(IPage page);
/// <summary>
/// Static holder for the lazily created Playwright driver and Chromium browser
/// used by the puppet downloaders.
/// </summary>
/// <remarks>
/// Both values are created synchronously on first access of <c>Value</c>, so the
/// first caller blocks until Playwright has started and Chromium has launched.
/// Nothing in this class disposes the driver or the browser.
/// </remarks>
public static class PuppetContext {
    /// <summary>Lazily started Playwright driver.</summary>
    public static Lazy<IPlaywright> Playwright { get; set; }

    /// <summary>Lazily launched Chromium browser, created from <see cref="Playwright"/>.</summary>
    public static Lazy<IBrowser> Browser { get; set; }

    static PuppetContext() {
        // GetAwaiter().GetResult() rethrows the original exception; Task.Result
        // would wrap it in an AggregateException and hide the real cause
        // (for example a missing browser installation) from the caller.
        Playwright = new Lazy<IPlaywright>(
            () => Microsoft.Playwright.Playwright.CreateAsync().GetAwaiter().GetResult());
        Browser = new Lazy<IBrowser>(
            () => Playwright.Value.Chromium.LaunchAsync().GetAwaiter().GetResult());
    }
}
}
+39
View File
@@ -0,0 +1,39 @@
using Microsoft.Playwright;
namespace Beam.Puppeteer {
/// <summary>
/// Binary unit downloader that obtains its bytes from a file download triggered
/// inside a browser page instead of a direct HTTP request.
/// </summary>
/// <typeparam name="T">Type produced by the transformer.</typeparam>
public class PuppetUnitDownloader<T> : UnitDownloaderBinary<T> {
    /// <summary>Callback run against the loaded page; it is expected to trigger the download.</summary>
    public AsyncManipulator PuppetManipulator { get; }

    /// <summary>Creates a downloader that drives a browser page with the given manipulator.</summary>
    /// <param name="client">HTTP client forwarded to the base downloader.</param>
    /// <param name="puppetManipulator">Callback run against the page after navigation.</param>
    /// <param name="asyncHtmlTransformer">Transformer applied to the downloaded document.</param>
    /// <param name="asyncDownloadFailurePredicates">Predicates forwarded to the base downloader.</param>
    public PuppetUnitDownloader(HttpClient client, AsyncManipulator puppetManipulator, AsyncTransformer<ByteDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<ByteDocument>[] asyncDownloadFailurePredicates)
        : base(client, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
        PuppetManipulator = puppetManipulator;
    }

    /// <summary>
    /// Opens a new page, navigates to <paramref name="link"/>, runs the manipulator,
    /// and transforms the file the page downloads.
    /// </summary>
    /// <param name="link">Address to navigate to.</param>
    /// <param name="ct">Token observed while the downloaded stream is copied.</param>
    /// <returns>
    /// <c>(true, value)</c> on success; <c>(false, default)</c> when a failure
    /// predicate rejects the document or any step after page creation throws.
    /// </returns>
    protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
        var page = await PuppetContext.Browser.Value.NewPageAsync();
        try {
            await page.GotoAsync(link);

            // Subscribe to the download event BEFORE the manipulator runs. Waiting
            // only afterwards can miss a download that the manipulator has already
            // triggered, leaving the wait to run into its timeout.
            var download = await page.RunAndWaitForDownloadAsync(() => PuppetManipulator(page));

            await using var stream = await download.CreateReadStreamAsync();

            // Copy instead of sizing a buffer from stream.Length: the stream is not
            // guaranteed to be seekable, and Length is unsupported on such streams.
            using var buffer = new System.IO.MemoryStream();
            await stream.CopyToAsync(buffer, ct);

            ByteDocument doc = new ByteDocument(download.SuggestedFilename, buffer.ToArray());
            if (FailurePredicates is not null && await IsFailure(doc))
                return (false, default);

            var transformed = await Transformer(doc);
            return (true, transformed);
        } catch (Exception) {
            // Best effort by design: any failure is reported as an unsuccessful attempt.
            return (false, default);
        } finally {
            if (!page.IsClosed)
                await page.CloseAsync();
        }
    }
}
}
@@ -3,22 +3,19 @@ using HtmlAgilityPack;
using Microsoft.Playwright;
namespace Beam.Puppeteer {
public class PuppetContext(IPlaywright playwright, IBrowser browser) {
public IPlaywright Playwright { get; set; } = playwright;
public IBrowser Browser { get; set; } = browser;
}
public class PuppetUnitPageDownloader<T> : UnitDownloader<T> {
public AsyncManipulator PuppetManipulator { get; }
public class PuppetUnitDownloader<T> : UnitDownloader<T> {
public PuppetContext Context { get; }
public PuppetUnitDownloader(PuppetContext pc, DownloadContext<T> context, AsyncTransformer<HtmlDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<HtmlDocument>[] asyncDownloadFailurePredicates)
: base(context.Web, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
Context = pc;
public PuppetUnitPageDownloader(HtmlWeb web, AsyncManipulator puppetManipulator, AsyncTransformer<HtmlDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<HtmlDocument>[] asyncDownloadFailurePredicates)
: base(web, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
PuppetManipulator = puppetManipulator;
}
protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
var page = await Context.Browser.NewPageAsync();
var page = await PuppetContext.Browser.Value.NewPageAsync();
try {
await page.GotoAsync(link);
await PuppetManipulator(page);
var content = await page.ContentAsync();
await page.CloseAsync();
@@ -36,4 +33,5 @@ namespace Beam.Puppeteer {
}
}
}
}
+47 -9
View File
@@ -5,6 +5,7 @@ using Microsoft.Extensions.Logging;
using System;
using System.Collections.Generic;
using HtmlAgilityPack;
using Beam.Puppeteer;
namespace Beam.Temporary.Cli {
/// <summary>
@@ -151,7 +152,8 @@ namespace Beam.Temporary.Cli {
private readonly DownloadContextBuilder<RawType> _ctxBuilder;
private readonly AsyncTransformer<RawType, OutType> _transformer;
private int _parallelism = 4;
private bool useFragments = false;
private bool _useFragments = false;
private AsyncManipulator? _useManipulator = null;
public ContextStage(DownloadContextBuilder<RawType> ctxBuilder, AsyncTransformer<RawType, OutType> transformer) {
_ctxBuilder = ctxBuilder;
@@ -178,15 +180,35 @@ namespace Beam.Temporary.Cli {
return this;
}
/// <summary>
/// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive with
/// <see cref="UsePuppet(AsyncManipulator)"/>: any manipulator set earlier is cleared when fragment mode is turned on.
/// </summary>
/// <returns>This stage instance, so further configuration calls can be chained.</returns>
public IContextStage UseFragments() {
useFragments = true;
if (_useManipulator is not null)
_useManipulator = null;
_useFragments = true;
return this;
}
/// <summary>
/// Selects the puppet-browser strategy for downloading links. It is mutually exclusive with
/// <see cref="UseFragments"/>, so fragment mode is switched off here.
/// </summary>
/// <param name="manipulator">The page manipulator to store for the puppet downloader.</param>
/// <returns>This stage instance, so further configuration calls can be chained.</returns>
public IContextStage UsePuppet(AsyncManipulator manipulator) {
    _useManipulator = manipulator;
    // Assigning false unconditionally has the same effect as clearing it only when set.
    _useFragments = false;
    return this;
}
private object ConstructUnitDownloader(DownloadContext<RawType> context) {
return (useFragments, _transformer, context.AsyncFailurePredicates) switch {
return (_useFragments, _useManipulator, _transformer, context.AsyncFailurePredicates) switch {
// ──────────────── fragmented HTML ────────────────
(true, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
(true, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
=> new UnitFragmentDownloader<OutType>(
context.Web,
@@ -195,7 +217,7 @@ namespace Beam.Temporary.Cli {
_parallelism,
context.DownloadLogger),
// ──────────────── fragmented binary ────────────────
(true, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
(true, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
=> new UnitFragmentDownloaderBinary<OutType>(
context.Client,
@@ -204,26 +226,42 @@ namespace Beam.Temporary.Cli {
_parallelism,
context.DownloadLogger),
// ──────────────── single HTML ────────────────
(false, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
(false, null, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
=> new UnitDownloader<OutType>(
context.Web,
asyncHtmlTransformer,
documentFailurePredicates),
// ──────────────── single binary ────────────────
(false, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
(false, null, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
=> new UnitDownloaderBinary<OutType>(
context.Client,
asyncBinaryTransformer,
responseFailurePredicates),
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
// ──────────────── single puppet binary ────────────────
(false, AsyncManipulator manipulator, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
=> new PuppetUnitDownloader<OutType>(
context.Client,
manipulator,
asyncBinaryTransformer,
responseFailurePredicates),
// ──────────────── single puppet HTML ────────────────
(false, AsyncManipulator manipulator, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
=> new PuppetUnitPageDownloader<OutType>(
context.Web,
manipulator,
asyncHtmlTransformer,
documentFailurePredicates),
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {_useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
};
}
private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
var copyOfContext = context.CreateBuilder().Build();
return useFragments switch {
return _useFragments switch {
true => new SequentialFragmentDownloader<RawType, OutType>(
copyOfContext,
ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
+6 -5
View File
@@ -20,13 +20,13 @@ namespace Beam {
public int LinksPerDownload { get; } = 1;
/// <summary>Runs all configured failure predicates in parallel on the raw HTTP response.</summary>
protected virtual async Task<bool> IsFailure(HttpResponseMessage response) {
protected virtual async Task<bool> IsFailure(ByteDocument response) {
if (FailurePredicates is null) return false;
var failed = false;
await Parallel.ForEachAsync(FailurePredicates, async (pred, ct) => {
if (failed || pred is null) return;
if (await pred(new ByteDocument(response.RequestMessage?.RequestUri?.AbsolutePath ?? "", await response.Content.ReadAsByteArrayAsync(ct))))
if (await pred(response))
failed = true;
});
return failed;
@@ -38,10 +38,11 @@ namespace Beam {
using var response = await Client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, ct);
if (!response.IsSuccessStatusCode) return (false, default);
if (await IsFailure(response)) return (false, default);
var bytes = await response.Content.ReadAsByteArrayAsync(ct);
return (true, await Transformer(new ByteDocument(link, bytes)));
var doc = new ByteDocument(link, bytes);
if (await IsFailure(doc)) return (false, default);
return (true, await Transformer(doc));
} catch {
return (false, default);
}