From 3baa31a7cc27abb365af212b5529dc0f196148fd Mon Sep 17 00:00:00 2001 From: qwsdcvghyu89 <61093706+qwsdcvghyu89@users.noreply.github.com> Date: Wed, 25 Jun 2025 13:42:24 +0300 Subject: [PATCH] feat: add Puppeteer integration for web downloads This introduces a new Puppeteer-based mechanism for downloading web content. It provides a flexible way to manipulate pages during downloads, enhancing the ability to handle dynamic content and improve the overall download process. --- Beam.Puppeteer/PuppetContext.cs | 16 ++++++ Beam.Puppeteer/PuppetUnitDownloader.cs | 39 +++++++++++++ ...nloader.cs => PuppetUnitPageDownloader.cs} | 20 +++---- Beam.Temporary.Cli/DownloadBuilder.cs | 56 ++++++++++++++++--- Beam/UnitDownloaderBinary.cs | 11 ++-- 5 files changed, 117 insertions(+), 25 deletions(-) create mode 100644 Beam.Puppeteer/PuppetContext.cs create mode 100644 Beam.Puppeteer/PuppetUnitDownloader.cs rename Beam.Puppeteer/{PuppetedUnitDownloader.cs => PuppetUnitPageDownloader.cs} (54%) diff --git a/Beam.Puppeteer/PuppetContext.cs b/Beam.Puppeteer/PuppetContext.cs new file mode 100644 index 0000000..3fdffce --- /dev/null +++ b/Beam.Puppeteer/PuppetContext.cs @@ -0,0 +1,16 @@ +using Microsoft.Playwright; + +namespace Beam.Puppeteer { + public delegate Task AsyncManipulator(IPage page); + + public static class PuppetContext { + public static Lazy Playwright { get; set; } + public static Lazy Browser { get; set; } + + static PuppetContext() { + Playwright = new Lazy(() => Microsoft.Playwright.Playwright.CreateAsync().Result); + Browser = new Lazy(() => Playwright.Value.Chromium.LaunchAsync().Result); + } + } + +} diff --git a/Beam.Puppeteer/PuppetUnitDownloader.cs b/Beam.Puppeteer/PuppetUnitDownloader.cs new file mode 100644 index 0000000..31d41b8 --- /dev/null +++ b/Beam.Puppeteer/PuppetUnitDownloader.cs @@ -0,0 +1,39 @@ +using Microsoft.Playwright; + +namespace Beam.Puppeteer { + public class PuppetUnitDownloader : UnitDownloaderBinary { + public AsyncManipulator PuppetManipulator { get; } + + public PuppetUnitDownloader(HttpClient client, AsyncManipulator puppetManipulator, AsyncTransformer asyncHtmlTransformer, AsyncDownloadFailurePredicate[] asyncDownloadFailurePredicates) + : base(client, asyncHtmlTransformer, asyncDownloadFailurePredicates) { + PuppetManipulator = puppetManipulator; + } + + protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) { + var page = await PuppetContext.Browser.Value.NewPageAsync(); + try { + await page.GotoAsync(link); + await PuppetManipulator(page); + var download = await page.WaitForDownloadAsync(); + + using var stream = await download.CreateReadStreamAsync(); + byte[] content = new byte[stream.Length]; + + await stream.ReadExactlyAsync(content, ct); + + ByteDocument doc = new ByteDocument(download.SuggestedFilename, content); + if (FailurePredicates is not null && await IsFailure(doc)) + return (false, default); + + var transformed = await Transformer(doc); + return (true, transformed); + } catch (Exception) { + return (false, default); + } finally { + if (!page.IsClosed) + await page.CloseAsync(); + } + } + } + +} diff --git a/Beam.Puppeteer/PuppetedUnitDownloader.cs b/Beam.Puppeteer/PuppetUnitPageDownloader.cs similarity index 54% rename from Beam.Puppeteer/PuppetedUnitDownloader.cs rename to Beam.Puppeteer/PuppetUnitPageDownloader.cs index 2e62992..3667544 100644 --- a/Beam.Puppeteer/PuppetedUnitDownloader.cs +++ b/Beam.Puppeteer/PuppetUnitPageDownloader.cs @@ -3,22 +3,19 @@ using HtmlAgilityPack; using Microsoft.Playwright; namespace Beam.Puppeteer { - public class PuppetContext(IPlaywright playwright, IBrowser browser) { - public IPlaywright Playwright { get; set; } = playwright; - public IBrowser Browser { get; set; } = browser; - } + public class PuppetUnitPageDownloader : UnitDownloader { + public AsyncManipulator PuppetManipulator { get; } - public class PuppetUnitDownloader : UnitDownloader { - public PuppetContext Context { get; } - - public PuppetUnitDownloader(PuppetContext pc, DownloadContext context, AsyncTransformer asyncHtmlTransformer, AsyncDownloadFailurePredicate[] asyncDownloadFailurePredicates) - : base(context.Web, asyncHtmlTransformer, asyncDownloadFailurePredicates) { - Context = pc; + public PuppetUnitPageDownloader(HtmlWeb web, AsyncManipulator puppetManipulator, AsyncTransformer asyncHtmlTransformer, AsyncDownloadFailurePredicate[] asyncDownloadFailurePredicates) + : base(web, asyncHtmlTransformer, asyncDownloadFailurePredicates) { + PuppetManipulator = puppetManipulator; } protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) { - var page = await Context.Browser.NewPageAsync(); + var page = await PuppetContext.Browser.Value.NewPageAsync(); try { + await page.GotoAsync(link); + await PuppetManipulator(page); var content = await page.ContentAsync(); await page.CloseAsync(); @@ -36,4 +33,5 @@ namespace Beam.Puppeteer { } } } + } diff --git a/Beam.Temporary.Cli/DownloadBuilder.cs b/Beam.Temporary.Cli/DownloadBuilder.cs index 5b30139..def26f8 100644 --- a/Beam.Temporary.Cli/DownloadBuilder.cs +++ b/Beam.Temporary.Cli/DownloadBuilder.cs @@ -5,6 +5,7 @@ using Microsoft.Extensions.Logging; using System; using System.Collections.Generic; using HtmlAgilityPack; +using Beam.Puppeteer; namespace Beam.Temporary.Cli { /// @@ -151,7 +152,8 @@ namespace Beam.Temporary.Cli { private readonly DownloadContextBuilder _ctxBuilder; private readonly AsyncTransformer _transformer; private int _parallelism = 4; - private bool useFragments = false; + private bool _useFragments = false; + private AsyncManipulator? _useManipulator = null; public ContextStage(DownloadContextBuilder ctxBuilder, AsyncTransformer transformer) { _ctxBuilder = ctxBuilder; @@ -178,15 +180,35 @@ namespace Beam.Temporary.Cli { return this; } + /// + /// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive with + /// + /// public IContextStage UseFragments() { - useFragments = true; + if (_useManipulator is not null) + _useManipulator = null; + + _useFragments = true; + return this; + } + + /// + /// Use a puppet browser to download the links. This strategy is mutually exclusive with + /// + /// The page manipulator + /// + public IContextStage UsePuppet(AsyncManipulator manipulator) { + if (_useFragments) + _useFragments = false; + + _useManipulator = manipulator; return this; } private object ConstructUnitDownloader(DownloadContext context) { - return (useFragments, _transformer, context.AsyncFailurePredicates) switch { + return (_useFragments, _useManipulator, _transformer, context.AsyncFailurePredicates) switch { // ──────────────── fragmented HTML ──────────────── - (true, AsyncTransformer asyncHtmlTransformer, + (true, _, AsyncTransformer asyncHtmlTransformer, AsyncDownloadFailurePredicate[] documentFailurePredicates) => new UnitFragmentDownloader( context.Web, @@ -195,7 +217,7 @@ namespace Beam.Temporary.Cli { _parallelism, context.DownloadLogger), // ──────────────── fragmented binary ──────────────── - (true, AsyncTransformer asyncBinaryTransformer, + (true, _, AsyncTransformer asyncBinaryTransformer, AsyncDownloadFailurePredicate[] responseFailurePredicates) => new UnitFragmentDownloaderBinary( context.Client, @@ -204,26 +226,42 @@ namespace Beam.Temporary.Cli { _parallelism, context.DownloadLogger), // ──────────────── single HTML ──────────────── - (false, AsyncTransformer asyncHtmlTransformer, + (false, null, AsyncTransformer asyncHtmlTransformer, AsyncDownloadFailurePredicate[] documentFailurePredicates) => new UnitDownloader( context.Web, asyncHtmlTransformer, documentFailurePredicates), // ──────────────── single binary ──────────────── - (false, AsyncTransformer asyncBinaryTransformer, + (false, null, AsyncTransformer asyncBinaryTransformer, AsyncDownloadFailurePredicate[] responseFailurePredicates) => new UnitDownloaderBinary( context.Client, asyncBinaryTransformer, responseFailurePredicates), - _ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"), + // ──────────────── single puppet binary ──────────────── + (false, AsyncManipulator manipulator, AsyncTransformer asyncBinaryTransformer, + AsyncDownloadFailurePredicate[] responseFailurePredicates) + => new PuppetUnitDownloader( + context.Client, + manipulator, + asyncBinaryTransformer, + responseFailurePredicates), + // ──────────────── single puppet HTML ──────────────── + (false, AsyncManipulator manipulator, AsyncTransformer asyncHtmlTransformer, + AsyncDownloadFailurePredicate[] documentFailurePredicates) + => new PuppetUnitPageDownloader( + context.Web, + manipulator, + asyncHtmlTransformer, + documentFailurePredicates), + _ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {_useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"), }; } private IAsyncEnumerator> ConstructDownloader(DownloadContext context) { var copyOfContext = context.CreateBuilder().Build(); - return useFragments switch { + return _useFragments switch { true => new SequentialFragmentDownloader( copyOfContext, ctx => (IUnitDownloader>>)ConstructUnitDownloader(ctx), diff --git a/Beam/UnitDownloaderBinary.cs b/Beam/UnitDownloaderBinary.cs index 8d45471..4248307 100644 --- a/Beam/UnitDownloaderBinary.cs +++ b/Beam/UnitDownloaderBinary.cs @@ -20,13 +20,13 @@ namespace Beam { public int LinksPerDownload { get; } = 1; /// Runs all configured failure predicates in parallel on the raw HTTP response. - protected virtual async Task IsFailure(HttpResponseMessage response) { + protected virtual async Task IsFailure(ByteDocument response) { if (FailurePredicates is null) return false; var failed = false; await Parallel.ForEachAsync(FailurePredicates, async (pred, ct) => { if (failed || pred is null) return; - if (await pred(new ByteDocument(response.RequestMessage?.RequestUri?.AbsolutePath ?? "", await response.Content.ReadAsByteArrayAsync(ct)))) + if (await pred(response)) failed = true; }); return failed; @@ -38,10 +38,11 @@ namespace Beam { using var response = await Client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, ct); if (!response.IsSuccessStatusCode) return (false, default); - if (await IsFailure(response)) return (false, default); - var bytes = await response.Content.ReadAsByteArrayAsync(ct); - return (true, await Transformer(new ByteDocument(link, bytes))); + var doc = new ByteDocument(link, bytes); + if (await IsFailure(doc)) return (false, default); + + return (true, await Transformer(doc)); } catch { return (false, default); }