feat: add Puppeteer integration for web downloads
This introduces a new Puppeteer-based mechanism for downloading web content. It provides a flexible way to manipulate pages during downloads, enhancing the ability to handle dynamic content and improve the overall download process.
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
using Microsoft.Playwright;
|
||||
|
||||
namespace Beam.Puppeteer {
|
||||
/// <summary>
/// Asynchronous callback that receives a Playwright <see cref="IPage"/> and returns a
/// <see cref="Task"/> that completes when the callback has finished working with the page.
/// </summary>
/// <param name="page">The page handed to the callback.</param>
public delegate Task AsyncManipulator(IPage page);
|
||||
|
||||
/// <summary>
/// Static holder for the lazily created Playwright driver and the Chromium browser
/// launched from it. Nothing is started until one of the <c>Value</c> properties is read.
/// </summary>
public static class PuppetContext {
    /// <summary>Playwright driver, created the first time <c>Value</c> is read.</summary>
    public static Lazy<IPlaywright> Playwright { get; set; }

    /// <summary>
    /// Chromium browser, launched from the current <see cref="Playwright"/> instance the first
    /// time <c>Value</c> is read.
    /// </summary>
    public static Lazy<IBrowser> Browser { get; set; }

    static PuppetContext() {
        Playwright = new Lazy<IPlaywright>(CreatePlaywright);
        Browser = new Lazy<IBrowser>(LaunchChromium);
    }

    // Creates the Playwright driver. Reading .Result blocks the calling thread until
    // the asynchronous creation has completed.
    private static IPlaywright CreatePlaywright() {
        Task<IPlaywright> pending = Microsoft.Playwright.Playwright.CreateAsync();
        return pending.Result;
    }

    // Launches Chromium from whatever Playwright instance the property holds at the
    // moment of first use; blocks on .Result in the same way as the driver factory.
    private static IBrowser LaunchChromium() {
        IPlaywright driver = Playwright.Value;
        Task<IBrowser> pending = driver.Chromium.LaunchAsync();
        return pending.Result;
    }
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
using Microsoft.Playwright;
|
||||
|
||||
namespace Beam.Puppeteer {
|
||||
/// <summary>
/// Binary unit downloader that obtains its payload through a Playwright page instead of a
/// plain HTTP request: it opens a page, navigates to the link, runs the configured
/// manipulator and captures the browser download that results.
/// </summary>
/// <typeparam name="T">Type produced by the transformer from the downloaded bytes.</typeparam>
public class PuppetUnitDownloader<T> : UnitDownloaderBinary<T> {
    /// <summary>Callback run against the page after navigation; it is expected to trigger the download.</summary>
    public AsyncManipulator PuppetManipulator { get; }

    /// <summary>Creates a downloader that captures the download triggered by <paramref name="puppetManipulator"/>.</summary>
    /// <param name="client">HTTP client forwarded to the base downloader.</param>
    /// <param name="puppetManipulator">Callback that drives the page until a download starts.</param>
    /// <param name="asyncHtmlTransformer">Transformer applied to the downloaded document.</param>
    /// <param name="asyncDownloadFailurePredicates">Predicates forwarded to the base downloader for failure detection.</param>
    public PuppetUnitDownloader(HttpClient client, AsyncManipulator puppetManipulator, AsyncTransformer<ByteDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<ByteDocument>[] asyncDownloadFailurePredicates)
        : base(client, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
        PuppetManipulator = puppetManipulator;
    }

    /// <summary>
    /// Performs a single download attempt for <paramref name="link"/> through a fresh browser page.
    /// </summary>
    /// <param name="link">Address the page navigates to before the manipulator runs.</param>
    /// <param name="ct">Token observed while the downloaded stream is being copied.</param>
    /// <returns>
    /// <c>(true, value)</c> with the transformed document on success; <c>(false, default)</c> when
    /// a failure predicate matches or any exception occurs after the page was opened.
    /// </returns>
    protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
        var page = await PuppetContext.Browser.Value.NewPageAsync();
        try {
            await page.GotoAsync(link);

            // The download listener has to be registered BEFORE the action that starts the
            // download. Waiting only after the manipulator has finished misses a download
            // it already triggered and ends in a timeout.
            var download = await page.RunAndWaitForDownloadAsync(() => PuppetManipulator(page));

            await using var stream = await download.CreateReadStreamAsync();

            // Do not size the buffer from stream.Length: the returned stream is not
            // guaranteed to be seekable. Copy until end-of-stream instead.
            using var buffer = new System.IO.MemoryStream();
            await stream.CopyToAsync(buffer, ct);
            byte[] content = buffer.ToArray();

            ByteDocument doc = new ByteDocument(download.SuggestedFilename, content);
            if (FailurePredicates is not null && await IsFailure(doc))
                return (false, default);

            var transformed = await Transformer(doc);
            return (true, transformed);
        } catch (Exception) {
            // Deliberate best-effort: any failure of this attempt is reported as "not downloaded".
            return (false, default);
        } finally {
            if (!page.IsClosed)
                await page.CloseAsync();
        }
    }
}
|
||||
|
||||
}
|
||||
+9
-11
@@ -3,22 +3,19 @@ using HtmlAgilityPack;
|
||||
using Microsoft.Playwright;
|
||||
|
||||
namespace Beam.Puppeteer {
|
||||
public class PuppetContext(IPlaywright playwright, IBrowser browser) {
|
||||
public IPlaywright Playwright { get; set; } = playwright;
|
||||
public IBrowser Browser { get; set; } = browser;
|
||||
}
|
||||
public class PuppetUnitPageDownloader<T> : UnitDownloader<T> {
|
||||
public AsyncManipulator PuppetManipulator { get; }
|
||||
|
||||
public class PuppetUnitDownloader<T> : UnitDownloader<T> {
|
||||
public PuppetContext Context { get; }
|
||||
|
||||
public PuppetUnitDownloader(PuppetContext pc, DownloadContext<T> context, AsyncTransformer<HtmlDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<HtmlDocument>[] asyncDownloadFailurePredicates)
|
||||
: base(context.Web, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
|
||||
Context = pc;
|
||||
public PuppetUnitPageDownloader(HtmlWeb web, AsyncManipulator puppetManipulator, AsyncTransformer<HtmlDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<HtmlDocument>[] asyncDownloadFailurePredicates)
|
||||
: base(web, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
|
||||
PuppetManipulator = puppetManipulator;
|
||||
}
|
||||
|
||||
protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
|
||||
var page = await Context.Browser.NewPageAsync();
|
||||
var page = await PuppetContext.Browser.Value.NewPageAsync();
|
||||
try {
|
||||
await page.GotoAsync(link);
|
||||
await PuppetManipulator(page);
|
||||
var content = await page.ContentAsync();
|
||||
await page.CloseAsync();
|
||||
|
||||
@@ -36,4 +33,5 @@ namespace Beam.Puppeteer {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -5,6 +5,7 @@ using Microsoft.Extensions.Logging;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using HtmlAgilityPack;
|
||||
using Beam.Puppeteer;
|
||||
|
||||
namespace Beam.Temporary.Cli {
|
||||
/// <summary>
|
||||
@@ -151,7 +152,8 @@ namespace Beam.Temporary.Cli {
|
||||
private readonly DownloadContextBuilder<RawType> _ctxBuilder;
|
||||
private readonly AsyncTransformer<RawType, OutType> _transformer;
|
||||
private int _parallelism = 4;
|
||||
private bool useFragments = false;
|
||||
private bool _useFragments = false;
|
||||
private AsyncManipulator? _useManipulator = null;
|
||||
|
||||
public ContextStage(DownloadContextBuilder<RawType> ctxBuilder, AsyncTransformer<RawType, OutType> transformer) {
|
||||
_ctxBuilder = ctxBuilder;
|
||||
@@ -178,15 +180,35 @@ namespace Beam.Temporary.Cli {
|
||||
return this;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive with <see cref="UsePuppet(AsyncManipulator)"/>
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public IContextStage UseFragments() {
|
||||
useFragments = true;
|
||||
if (_useManipulator is not null)
|
||||
_useManipulator = null;
|
||||
|
||||
_useFragments = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
/// <summary>
/// Selects the puppet-browser strategy for downloading links. It is mutually exclusive with
/// <see cref="UseFragments"/>: choosing it clears the fragment flag.
/// </summary>
/// <param name="manipulator">The page manipulator to store for the puppet strategy.</param>
/// <returns>This stage instance.</returns>
public IContextStage UsePuppet(AsyncManipulator manipulator) {
    // An unconditional reset has the same effect as clearing the flag only when it is set.
    _useFragments = false;
    _useManipulator = manipulator;
    return this;
}
|
||||
|
||||
private object ConstructUnitDownloader(DownloadContext<RawType> context) {
|
||||
return (useFragments, _transformer, context.AsyncFailurePredicates) switch {
|
||||
return (_useFragments, _useManipulator, _transformer, context.AsyncFailurePredicates) switch {
|
||||
// ──────────────── fragmented HTML ────────────────
|
||||
(true, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
(true, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
||||
=> new UnitFragmentDownloader<OutType>(
|
||||
context.Web,
|
||||
@@ -195,7 +217,7 @@ namespace Beam.Temporary.Cli {
|
||||
_parallelism,
|
||||
context.DownloadLogger),
|
||||
// ──────────────── fragmented binary ────────────────
|
||||
(true, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
(true, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
||||
=> new UnitFragmentDownloaderBinary<OutType>(
|
||||
context.Client,
|
||||
@@ -204,26 +226,42 @@ namespace Beam.Temporary.Cli {
|
||||
_parallelism,
|
||||
context.DownloadLogger),
|
||||
// ──────────────── single HTML ────────────────
|
||||
(false, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
(false, null, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
||||
=> new UnitDownloader<OutType>(
|
||||
context.Web,
|
||||
asyncHtmlTransformer,
|
||||
documentFailurePredicates),
|
||||
// ──────────────── single binary ────────────────
|
||||
(false, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
(false, null, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
||||
=> new UnitDownloaderBinary<OutType>(
|
||||
context.Client,
|
||||
asyncBinaryTransformer,
|
||||
responseFailurePredicates),
|
||||
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
|
||||
// ──────────────── single puppet binary ────────────────
|
||||
(false, AsyncManipulator manipulator, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
||||
=> new PuppetUnitDownloader<OutType>(
|
||||
context.Client,
|
||||
manipulator,
|
||||
asyncBinaryTransformer,
|
||||
responseFailurePredicates),
|
||||
// ──────────────── single puppet HTML ────────────────
|
||||
(false, AsyncManipulator manipulator, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
||||
=> new PuppetUnitPageDownloader<OutType>(
|
||||
context.Web,
|
||||
manipulator,
|
||||
asyncHtmlTransformer,
|
||||
documentFailurePredicates),
|
||||
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {_useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
|
||||
};
|
||||
}
|
||||
|
||||
private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
|
||||
var copyOfContext = context.CreateBuilder().Build();
|
||||
return useFragments switch {
|
||||
return _useFragments switch {
|
||||
true => new SequentialFragmentDownloader<RawType, OutType>(
|
||||
copyOfContext,
|
||||
ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
|
||||
|
||||
@@ -20,13 +20,13 @@ namespace Beam {
|
||||
public int LinksPerDownload { get; } = 1;
|
||||
|
||||
/// <summary>Runs all configured failure predicates in parallel on the raw HTTP response.</summary>
|
||||
protected virtual async Task<bool> IsFailure(HttpResponseMessage response) {
|
||||
protected virtual async Task<bool> IsFailure(ByteDocument response) {
|
||||
if (FailurePredicates is null) return false;
|
||||
|
||||
var failed = false;
|
||||
await Parallel.ForEachAsync(FailurePredicates, async (pred, ct) => {
|
||||
if (failed || pred is null) return;
|
||||
if (await pred(new ByteDocument(response.RequestMessage?.RequestUri?.AbsolutePath ?? "", await response.Content.ReadAsByteArrayAsync(ct))))
|
||||
if (await pred(response))
|
||||
failed = true;
|
||||
});
|
||||
return failed;
|
||||
@@ -38,10 +38,11 @@ namespace Beam {
|
||||
using var response = await Client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, ct);
|
||||
if (!response.IsSuccessStatusCode) return (false, default);
|
||||
|
||||
if (await IsFailure(response)) return (false, default);
|
||||
|
||||
var bytes = await response.Content.ReadAsByteArrayAsync(ct);
|
||||
return (true, await Transformer(new ByteDocument(link, bytes)));
|
||||
var doc = new ByteDocument(link, bytes);
|
||||
if (await IsFailure(doc)) return (false, default);
|
||||
|
||||
return (true, await Transformer(doc));
|
||||
} catch {
|
||||
return (false, default);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user