feat: add Puppeteer integration for web downloads
This introduces a new Puppeteer-style mechanism, built on Microsoft.Playwright, for downloading web content. It provides a flexible way to manipulate pages during downloads, making it easier to handle dynamic content and improving the overall download process.
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
using Microsoft.Playwright;
|
||||
|
||||
namespace Beam.Puppeteer {
|
||||
/// <summary>
/// Asynchronous callback that is handed a Playwright <see cref="IPage"/> and may act on it.
/// </summary>
/// <param name="page">The page the callback operates on.</param>
/// <returns>A task that completes when the callback has finished.</returns>
public delegate Task AsyncManipulator(IPage page);
|
||||
|
||||
/// <summary>
/// Static holder for a shared Playwright driver and the Chromium browser launched from it.
/// </summary>
/// <remarks>
/// Both values are wrapped in <see cref="Lazy{T}"/>, so nothing is started until the first
/// <c>.Value</c> access.
/// NOTE(review): the factories block on <c>Task.Result</c>, so that first access waits
/// synchronously for the asynchronous start-up call — confirm this is acceptable for every caller.
/// </remarks>
public static class PuppetContext {
    /// <summary>Lazily created Playwright driver.</summary>
    public static Lazy<IPlaywright> Playwright { get; set; }

    /// <summary>Lazily launched Chromium browser, obtained from the current <c>Playwright</c> value.</summary>
    public static Lazy<IBrowser> Browser { get; set; }

    static PuppetContext() {
        Playwright = new Lazy<IPlaywright>(CreatePlaywright);
        Browser = new Lazy<IBrowser>(LaunchChromium);
    }

    // Starts the Playwright driver and blocks until it is available.
    private static IPlaywright CreatePlaywright() {
        var startup = Microsoft.Playwright.Playwright.CreateAsync();
        return startup.Result;
    }

    // Launches Chromium from whatever driver the Playwright property holds at call time.
    private static IBrowser LaunchChromium() {
        var driver = Playwright.Value;
        return driver.Chromium.LaunchAsync().Result;
    }
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
using Microsoft.Playwright;
|
||||
|
||||
namespace Beam.Puppeteer {
|
||||
/// <summary>
/// Unit downloader that opens a browser page, runs a caller-supplied manipulator on it,
/// captures the resulting file download and passes its bytes to the inherited transformer.
/// </summary>
/// <typeparam name="T">Type produced by the transformer from the downloaded <c>ByteDocument</c>.</typeparam>
public class PuppetUnitDownloader<T> : UnitDownloaderBinary<T> {
    /// <summary>
    /// Callback run against the page after navigation. The download is awaited around this
    /// call, so it is presumably the step that triggers the download.
    /// </summary>
    public AsyncManipulator PuppetManipulator { get; }

    /// <summary>Creates a downloader that uses <paramref name="puppetManipulator"/> to drive the page.</summary>
    /// <param name="client">HTTP client forwarded to the base downloader.</param>
    /// <param name="puppetManipulator">Callback invoked on the page once navigation has completed.</param>
    /// <param name="asyncHtmlTransformer">Transformer forwarded to the base downloader.</param>
    /// <param name="asyncDownloadFailurePredicates">Failure predicates forwarded to the base downloader.</param>
    public PuppetUnitDownloader(HttpClient client, AsyncManipulator puppetManipulator, AsyncTransformer<ByteDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<ByteDocument>[] asyncDownloadFailurePredicates)
        : base(client, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
        PuppetManipulator = puppetManipulator;
    }

    /// <summary>
    /// Performs a single download attempt for <paramref name="link"/> in a fresh browser page.
    /// </summary>
    /// <param name="link">Address the page navigates to before the manipulator runs.</param>
    /// <param name="ct">Token observed while the downloaded bytes are being read.</param>
    /// <returns>
    /// <c>(true, value)</c> with the transformer's result on success; <c>(false, default)</c> when the
    /// inherited failure check rejects the document or when any exception is thrown along the way.
    /// </returns>
    protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
        var page = await PuppetContext.Browser.Value.NewPageAsync();
        try {
            await page.GotoAsync(link);

            // Begin listening for the download BEFORE the manipulator runs. Waiting only after it
            // has finished can miss a download event that the manipulator itself triggered.
            var download = await page.RunAndWaitForDownloadAsync(() => PuppetManipulator(page));

            // Copy instead of pre-sizing from stream.Length: the stream returned by Playwright is
            // not guaranteed to be seekable or to report a length (NOTE(review): confirm against
            // the Playwright .NET version in use).
            byte[] content;
            await using (var stream = await download.CreateReadStreamAsync()) {
                using var buffer = new System.IO.MemoryStream();
                await stream.CopyToAsync(buffer, ct);
                content = buffer.ToArray();
            }

            ByteDocument doc = new ByteDocument(download.SuggestedFilename, content);
            if (FailurePredicates is not null && await IsFailure(doc)) {
                return (false, default);
            }

            var transformed = await Transformer(doc);
            return (true, transformed);
        } catch (Exception) {
            // Deliberate best effort: every failure in this attempt is reported as "not downloaded".
            return (false, default);
        } finally {
            // Always release the page, whichever way the attempt ended.
            if (!page.IsClosed) {
                await page.CloseAsync();
            }
        }
    }
}
|
||||
|
||||
}
|
||||
+9
-11
@@ -3,22 +3,19 @@ using HtmlAgilityPack;
|
||||
using Microsoft.Playwright;
|
||||
|
||||
namespace Beam.Puppeteer {
|
||||
public class PuppetContext(IPlaywright playwright, IBrowser browser) {
|
||||
public IPlaywright Playwright { get; set; } = playwright;
|
||||
public IBrowser Browser { get; set; } = browser;
|
||||
}
|
||||
public class PuppetUnitPageDownloader<T> : UnitDownloader<T> {
|
||||
public AsyncManipulator PuppetManipulator { get; }
|
||||
|
||||
public class PuppetUnitDownloader<T> : UnitDownloader<T> {
|
||||
public PuppetContext Context { get; }
|
||||
|
||||
public PuppetUnitDownloader(PuppetContext pc, DownloadContext<T> context, AsyncTransformer<HtmlDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<HtmlDocument>[] asyncDownloadFailurePredicates)
|
||||
: base(context.Web, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
|
||||
Context = pc;
|
||||
public PuppetUnitPageDownloader(HtmlWeb web, AsyncManipulator puppetManipulator, AsyncTransformer<HtmlDocument, T> asyncHtmlTransformer, AsyncDownloadFailurePredicate<HtmlDocument>[] asyncDownloadFailurePredicates)
|
||||
: base(web, asyncHtmlTransformer, asyncDownloadFailurePredicates) {
|
||||
PuppetManipulator = puppetManipulator;
|
||||
}
|
||||
|
||||
protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
|
||||
var page = await Context.Browser.NewPageAsync();
|
||||
var page = await PuppetContext.Browser.Value.NewPageAsync();
|
||||
try {
|
||||
await page.GotoAsync(link);
|
||||
await PuppetManipulator(page);
|
||||
var content = await page.ContentAsync();
|
||||
await page.CloseAsync();
|
||||
|
||||
@@ -36,4 +33,5 @@ namespace Beam.Puppeteer {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user