feat: add Puppeteer integration for web downloads
This introduces a new browser-driven mechanism for downloading web content, exposed through the `Beam.Puppeteer` namespace and built on Microsoft.Playwright. It lets callers supply a page manipulator that runs against the loaded page during a download, which improves handling of dynamic content and the overall download process.
This commit is contained in:
@@ -0,0 +1,16 @@
|
|||||||
|
using Microsoft.Playwright;
|
||||||
|
|
||||||
|
namespace Beam.Puppeteer {
|
||||||
|
/// <summary>
/// Asynchronous callback that receives a Playwright <see cref="IPage"/> and interacts with it.
/// The puppet downloaders in this namespace invoke it after navigating the page to the target link.
/// </summary>
/// <param name="page">The page to manipulate.</param>
/// <returns>A task that completes when the manipulation has finished.</returns>
public delegate Task AsyncManipulator(IPage page);
|
||||||
|
|
||||||
|
/// <summary>
/// Static holder for the shared Playwright driver and the Chromium browser launched from it.
/// </summary>
/// <remarks>
/// Both members are wrapped in <see cref="Lazy{T}"/>, so nothing is created until the first
/// read of <c>.Value</c>. The first read blocks the calling thread while the underlying
/// asynchronous factory completes.
/// </remarks>
public static class PuppetContext {
    /// <summary>Lazily created Playwright driver.</summary>
    public static Lazy<IPlaywright> Playwright { get; set; }

    /// <summary>Lazily launched Chromium browser, created from <see cref="Playwright"/>.</summary>
    public static Lazy<IBrowser> Browser { get; set; }

    static PuppetContext() {
        // Fully qualified because the simple name `Playwright` binds to the property above.
        Playwright = new Lazy<IPlaywright>(
            () => RunSync(() => Microsoft.Playwright.Playwright.CreateAsync()));

        Browser = new Lazy<IBrowser>(() => {
            // Resolve the driver on the calling thread before hopping to the thread pool.
            var driver = Playwright.Value;
            return RunSync(() => driver.Chromium.LaunchAsync());
        });
    }

    /// <summary>
    /// Runs an asynchronous factory to completion and returns its result synchronously.
    /// </summary>
    /// <remarks>
    /// The factory is started via <see cref="Task.Run{TResult}(Func{Task{TResult}})"/> so its continuations
    /// never need the caller's synchronization context (avoids the classic sync-over-async deadlock),
    /// and <c>GetAwaiter().GetResult()</c> rethrows the original exception instead of wrapping it
    /// in an <see cref="AggregateException"/> as <c>.Result</c> would.
    /// </remarks>
    private static TResult RunSync<TResult>(Func<Task<TResult>> factory)
        => Task.Run(factory).GetAwaiter().GetResult();
}
|
||||||
|
|
||||||
|
}
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
using Microsoft.Playwright;
|
||||||
|
|
||||||
|
namespace Beam.Puppeteer {
|
||||||
|
/// <summary>
/// Binary unit downloader that obtains its payload through a Playwright browser page:
/// it navigates to the link, runs a caller-supplied <see cref="AsyncManipulator"/> on the page,
/// and captures the browser download that results.
/// </summary>
/// <typeparam name="T">Type produced by the transformer from the downloaded bytes.</typeparam>
public class PuppetUnitDownloader<T> : UnitDownloaderBinary<T> {
    /// <summary>Callback run against the page after navigation.</summary>
    public AsyncManipulator PuppetManipulator { get; }

    /// <summary>Creates a downloader that drives a browser page with <paramref name="puppetManipulator"/>.</summary>
    /// <param name="client">HTTP client forwarded to the base downloader.</param>
    /// <param name="puppetManipulator">Callback invoked with the page once navigation has completed.</param>
    /// <param name="asyncHtmlTransformer">Transformer applied to the downloaded document.</param>
    /// <param name="asyncDownloadFailurePredicates">Predicates forwarded to the base downloader.</param>
    public PuppetUnitDownloader(HttpClient client, AsyncManipulator puppetManipulator, AsyncTransformer<ByteDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<ByteDocument>[] asyncDownloadFailurePredicates)
        : base(client, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
        PuppetManipulator = puppetManipulator;
    }

    /// <summary>
    /// Performs a single download attempt through a fresh browser page.
    /// </summary>
    /// <param name="link">Address the page navigates to.</param>
    /// <param name="ct">Token observed while the downloaded bytes are copied into memory.</param>
    /// <returns>
    /// <c>(true, value)</c> with the transformed document on success; <c>(false, default)</c> when a
    /// failure predicate matches or any exception is raised after the page was opened.
    /// </returns>
    protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
        var page = await PuppetContext.Browser.Value.NewPageAsync();

        try {
            // Subscribe to the download event BEFORE navigating/manipulating. Waiting only after
            // the manipulator has returned can miss a download that already started, leaving the
            // wait to run into its timeout.
            var download = await page.RunAndWaitForDownloadAsync(async () => {
                await page.GotoAsync(link);
                await PuppetManipulator(page);
            });

            await using var stream = await download.CreateReadStreamAsync();

            // Copy into a growable buffer instead of sizing an array from stream.Length:
            // Length is only supported on seekable streams.
            using var buffer = new System.IO.MemoryStream();
            await stream.CopyToAsync(buffer, ct);
            byte[] content = buffer.ToArray();

            ByteDocument doc = new ByteDocument(download.SuggestedFilename, content);
            if (FailurePredicates is not null && await IsFailure(doc))
                return (false, default);

            var transformed = await Transformer(doc);
            return (true, transformed);
        } catch (Exception) {
            // Best-effort contract: every failure of this attempt is reported as "not downloaded".
            return (false, default);
        } finally {
            if (!page.IsClosed)
                await page.CloseAsync();
        }
    }
}
|
||||||
|
|
||||||
|
}
|
||||||
+9
-11
@@ -3,22 +3,19 @@ using HtmlAgilityPack;
|
|||||||
using Microsoft.Playwright;
|
using Microsoft.Playwright;
|
||||||
|
|
||||||
namespace Beam.Puppeteer {
|
namespace Beam.Puppeteer {
|
||||||
public class PuppetContext(IPlaywright playwright, IBrowser browser) {
|
public class PuppetUnitPageDownloader<T> : UnitDownloader<T> {
|
||||||
public IPlaywright Playwright { get; set; } = playwright;
|
public AsyncManipulator PuppetManipulator { get; }
|
||||||
public IBrowser Browser { get; set; } = browser;
|
|
||||||
}
|
|
||||||
|
|
||||||
public class PuppetUnitDownloader<T> : UnitDownloader<T> {
|
public PuppetUnitPageDownloader(HtmlWeb web, AsyncManipulator puppetManipulator, AsyncTransformer<HtmlDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<HtmlDocument>[] asyncDownloadFailurePredicates)
|
||||||
public PuppetContext Context { get; }
|
: base(web, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
|
||||||
|
PuppetManipulator = puppetManipulator;
|
||||||
public PuppetUnitDownloader(PuppetContext pc, DownloadContext<T> context, AsyncTransformer<HtmlDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<HtmlDocument>[] asyncDownloadFailurePredicates)
|
|
||||||
: base(context.Web, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
|
|
||||||
Context = pc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
|
protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
|
||||||
var page = await Context.Browser.NewPageAsync();
|
var page = await PuppetContext.Browser.Value.NewPageAsync();
|
||||||
try {
|
try {
|
||||||
|
await page.GotoAsync(link);
|
||||||
|
await PuppetManipulator(page);
|
||||||
var content = await page.ContentAsync();
|
var content = await page.ContentAsync();
|
||||||
await page.CloseAsync();
|
await page.CloseAsync();
|
||||||
|
|
||||||
@@ -36,4 +33,5 @@ namespace Beam.Puppeteer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -5,6 +5,7 @@ using Microsoft.Extensions.Logging;
|
|||||||
using System;
|
using System;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using HtmlAgilityPack;
|
using HtmlAgilityPack;
|
||||||
|
using Beam.Puppeteer;
|
||||||
|
|
||||||
namespace Beam.Temporary.Cli {
|
namespace Beam.Temporary.Cli {
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -151,7 +152,8 @@ namespace Beam.Temporary.Cli {
|
|||||||
private readonly DownloadContextBuilder<RawType> _ctxBuilder;
|
private readonly DownloadContextBuilder<RawType> _ctxBuilder;
|
||||||
private readonly AsyncTransformer<RawType, OutType> _transformer;
|
private readonly AsyncTransformer<RawType, OutType> _transformer;
|
||||||
private int _parallelism = 4;
|
private int _parallelism = 4;
|
||||||
private bool useFragments = false;
|
private bool _useFragments = false;
|
||||||
|
private AsyncManipulator? _useManipulator = null;
|
||||||
|
|
||||||
public ContextStage(DownloadContextBuilder<RawType> ctxBuilder, AsyncTransformer<RawType, OutType> transformer) {
|
public ContextStage(DownloadContextBuilder<RawType> ctxBuilder, AsyncTransformer<RawType, OutType> transformer) {
|
||||||
_ctxBuilder = ctxBuilder;
|
_ctxBuilder = ctxBuilder;
|
||||||
@@ -178,15 +180,35 @@ namespace Beam.Temporary.Cli {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive with <see cref="UsePuppet(AsyncManipulator)"/>
|
||||||
|
/// </summary>
|
||||||
|
/// <returns></returns>
|
||||||
public IContextStage UseFragments() {
|
public IContextStage UseFragments() {
|
||||||
useFragments = true;
|
if (_useManipulator is not null)
|
||||||
|
_useManipulator = null;
|
||||||
|
|
||||||
|
_useFragments = true;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Use a puppet browser to download the links. This strategy is mutually exclusive with <see cref="UseFragments"/>
|
||||||
|
/// </summary>
|
||||||
|
/// <param name="manipulator">The page manipulator</param>
|
||||||
|
/// <returns></returns>
|
||||||
|
public IContextStage UsePuppet(AsyncManipulator manipulator) {
|
||||||
|
if (_useFragments)
|
||||||
|
_useFragments = false;
|
||||||
|
|
||||||
|
_useManipulator = manipulator;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
private object ConstructUnitDownloader(DownloadContext<RawType> context) {
|
private object ConstructUnitDownloader(DownloadContext<RawType> context) {
|
||||||
return (useFragments, _transformer, context.AsyncFailurePredicates) switch {
|
return (_useFragments, _useManipulator, _transformer, context.AsyncFailurePredicates) switch {
|
||||||
// ──────────────── fragmented HTML ────────────────
|
// ──────────────── fragmented HTML ────────────────
|
||||||
(true, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
(true, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||||
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
||||||
=> new UnitFragmentDownloader<OutType>(
|
=> new UnitFragmentDownloader<OutType>(
|
||||||
context.Web,
|
context.Web,
|
||||||
@@ -195,7 +217,7 @@ namespace Beam.Temporary.Cli {
|
|||||||
_parallelism,
|
_parallelism,
|
||||||
context.DownloadLogger),
|
context.DownloadLogger),
|
||||||
// ──────────────── fragmented binary ────────────────
|
// ──────────────── fragmented binary ────────────────
|
||||||
(true, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
(true, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||||
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
||||||
=> new UnitFragmentDownloaderBinary<OutType>(
|
=> new UnitFragmentDownloaderBinary<OutType>(
|
||||||
context.Client,
|
context.Client,
|
||||||
@@ -204,26 +226,42 @@ namespace Beam.Temporary.Cli {
|
|||||||
_parallelism,
|
_parallelism,
|
||||||
context.DownloadLogger),
|
context.DownloadLogger),
|
||||||
// ──────────────── single HTML ────────────────
|
// ──────────────── single HTML ────────────────
|
||||||
(false, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
(false, null, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||||
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
||||||
=> new UnitDownloader<OutType>(
|
=> new UnitDownloader<OutType>(
|
||||||
context.Web,
|
context.Web,
|
||||||
asyncHtmlTransformer,
|
asyncHtmlTransformer,
|
||||||
documentFailurePredicates),
|
documentFailurePredicates),
|
||||||
// ──────────────── single binary ────────────────
|
// ──────────────── single binary ────────────────
|
||||||
(false, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
(false, null, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||||
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
||||||
=> new UnitDownloaderBinary<OutType>(
|
=> new UnitDownloaderBinary<OutType>(
|
||||||
context.Client,
|
context.Client,
|
||||||
asyncBinaryTransformer,
|
asyncBinaryTransformer,
|
||||||
responseFailurePredicates),
|
responseFailurePredicates),
|
||||||
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
|
// ──────────────── single puppet binary ────────────────
|
||||||
|
(false, AsyncManipulator manipulator, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
|
||||||
|
AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
|
||||||
|
=> new PuppetUnitDownloader<OutType>(
|
||||||
|
context.Client,
|
||||||
|
manipulator,
|
||||||
|
asyncBinaryTransformer,
|
||||||
|
responseFailurePredicates),
|
||||||
|
// ──────────────── single puppet HTML ────────────────
|
||||||
|
(false, AsyncManipulator manipulator, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
|
||||||
|
AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
|
||||||
|
=> new PuppetUnitPageDownloader<OutType>(
|
||||||
|
context.Web,
|
||||||
|
manipulator,
|
||||||
|
asyncHtmlTransformer,
|
||||||
|
documentFailurePredicates),
|
||||||
|
_ => throw new Exception($"Unsupported transformer / failure-predicate combination. Missing pattern: {_useFragments} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
|
private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
|
||||||
var copyOfContext = context.CreateBuilder().Build();
|
var copyOfContext = context.CreateBuilder().Build();
|
||||||
return useFragments switch {
|
return _useFragments switch {
|
||||||
true => new SequentialFragmentDownloader<RawType, OutType>(
|
true => new SequentialFragmentDownloader<RawType, OutType>(
|
||||||
copyOfContext,
|
copyOfContext,
|
||||||
ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
|
ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
|
||||||
|
|||||||
@@ -20,13 +20,13 @@ namespace Beam {
|
|||||||
public int LinksPerDownload { get; } = 1;
|
public int LinksPerDownload { get; } = 1;
|
||||||
|
|
||||||
/// <summary>Runs all configured failure predicates in parallel on the raw HTTP response.</summary>
|
/// <summary>Runs all configured failure predicates in parallel on the raw HTTP response.</summary>
|
||||||
protected virtual async Task<bool> IsFailure(HttpResponseMessage response) {
|
protected virtual async Task<bool> IsFailure(ByteDocument response) {
|
||||||
if (FailurePredicates is null) return false;
|
if (FailurePredicates is null) return false;
|
||||||
|
|
||||||
var failed = false;
|
var failed = false;
|
||||||
await Parallel.ForEachAsync(FailurePredicates, async (pred, ct) => {
|
await Parallel.ForEachAsync(FailurePredicates, async (pred, ct) => {
|
||||||
if (failed || pred is null) return;
|
if (failed || pred is null) return;
|
||||||
if (await pred(new ByteDocument(response.RequestMessage?.RequestUri?.AbsolutePath ?? "", await response.Content.ReadAsByteArrayAsync(ct))))
|
if (await pred(response))
|
||||||
failed = true;
|
failed = true;
|
||||||
});
|
});
|
||||||
return failed;
|
return failed;
|
||||||
@@ -38,10 +38,11 @@ namespace Beam {
|
|||||||
using var response = await Client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, ct);
|
using var response = await Client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, ct);
|
||||||
if (!response.IsSuccessStatusCode) return (false, default);
|
if (!response.IsSuccessStatusCode) return (false, default);
|
||||||
|
|
||||||
if (await IsFailure(response)) return (false, default);
|
|
||||||
|
|
||||||
var bytes = await response.Content.ReadAsByteArrayAsync(ct);
|
var bytes = await response.Content.ReadAsByteArrayAsync(ct);
|
||||||
return (true, await Transformer(new ByteDocument(link, bytes)));
|
var doc = new ByteDocument(link, bytes);
|
||||||
|
if (await IsFailure(doc)) return (false, default);
|
||||||
|
|
||||||
|
return (true, await Transformer(doc));
|
||||||
} catch {
|
} catch {
|
||||||
return (false, default);
|
return (false, default);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user