Files
Beam/Beam.Temporary.Cli/DownloadBuilder.cs
T
qwsdcvghyu89 3baa31a7cc feat: add Puppeteer integration for web downloads
This introduces a new Puppeteer-based mechanism for downloading
web content. It provides a flexible way to manipulate pages
during downloads, enhancing the ability to handle dynamic
content and improve the overall download process.
2025-06-25 13:42:24 +03:00

284 lines
14 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using aeqw89.DataKeys;
using Beam.Dynamic;
using Beam;
using Microsoft.Extensions.Logging;
using System;
using System.Collections.Generic;
using HtmlAgilityPack;
using Beam.Puppeteer;
namespace Beam.Temporary.Cli {
/// <summary>
/// Type-safe, staged builder that prevents callers from forgetting the mandatory steps
/// (source → link selection → transformer) and surfaces operational knobs as first-class
/// methods instead of magic parameters.
/// </summary>
public static class DownloadBuilder<RawType, OutType> {
/* ──────────────────────────── Entry points ─────────────────────────── */
/// <summary>
/// Starts a builder from the meta source of the novel identified by <paramref name="novelKey"/>.
/// </summary>
/// <param name="novelKey">Key of the novel to look up in <paramref name="data"/>.</param>
/// <param name="data">Dictionary the novel, its source and its templates are read from.</param>
/// <returns>The link-selection stage of the builder.</returns>
/// <exception cref="KeyNotFoundException">The novel is not present in <paramref name="data"/>.</exception>
/// <exception cref="InvalidOperationException">The novel has no meta source or no meta template data.</exception>
public static ILinkStage FromMeta(DataKey<TextResource> novelKey, BeamDataDictionary data) {
    return Create(novelKey, data, SourceKind.Meta);
}
/// <summary>
/// Starts a builder from the text source of the novel identified by <paramref name="novelKey"/>.
/// </summary>
/// <param name="novelKey">Key of the novel to look up in <paramref name="data"/>.</param>
/// <param name="data">Dictionary the novel, its source and its templates are read from.</param>
/// <returns>The link-selection stage of the builder.</returns>
/// <exception cref="KeyNotFoundException">The novel is not present in <paramref name="data"/>.</exception>
/// <exception cref="InvalidOperationException">The novel has no text source.</exception>
public static ILinkStage FromText(DataKey<TextResource> novelKey, BeamDataDictionary data) {
    return Create(novelKey, data, SourceKind.Text);
}
/// <summary>
/// Starts a builder that is not tied to any novel: the caller supplies the links and the
/// transformer directly.
/// </summary>
/// <returns>
/// A link stage with no source, initial state or data dictionary attached (all three are
/// passed as <c>null</c>) and a fresh, empty context builder.
/// </returns>
public static IAlternativeLinkStage FromScratch() {
    return new LinkStage(
        Source: null!,
        Initial: null!,
        Data: null!,
        CtxBuilder: new DownloadContextBuilder<RawType>());
}
/* ────────────────────────────── Stages ─────────────────────────────── */
/// <summary>
/// Link-selection stage for builders started from a novel's meta or text source.
/// </summary>
public interface ILinkStage {
    /// <summary>
    /// Restricts the download to <paramref name="range"/>. Must be called before
    /// <see cref="WithLinkGenerator"/>.
    /// </summary>
    /// <param name="range">Start and end positions of the range to download.</param>
    /// <returns>The same stage, so further link options can be chained.</returns>
    ILinkStage WithRange(Range range);

    /// <summary>Selects a single link built from the source's template.</summary>
    /// <returns>The transformer-selection stage.</returns>
    ITransformStage WithLink();

    /// <summary>Selects a generated sequence of links built from the source's template.</summary>
    /// <returns>The transformer-selection stage.</returns>
    ITransformStage WithLinkGenerator();
}
/// <summary>
/// Link-selection stage for builders that are not tied to a novel: the links are
/// supplied by the caller.
/// </summary>
public interface IAlternativeLinkStage {
    /// <summary>Uses the given links as the download input.</summary>
    /// <param name="links">The links to download.</param>
    /// <returns>The transformer-selection stage.</returns>
    IAlternativeTransformStage WithLinks(IEnumerable<SourceLink> links);
}
/// <summary>
/// Transformer-selection stage for builders started from a novel's source.
/// </summary>
public interface ITransformStage {
    /// <summary>
    /// Creates the transformer from the source's data bindings and moves on to the
    /// context stage.
    /// </summary>
    /// <param name="factory">Builds the transformer from the bindings it is handed.</param>
    /// <returns>The context-configuration stage.</returns>
    IContextStage WithTransformer(Func<DataBindings, AsyncTransformer<RawType, OutType>> factory);
}
/// <summary>
/// Transformer-selection stage for builders whose links were supplied directly.
/// </summary>
public interface IAlternativeTransformStage {
    /// <summary>Uses <paramref name="transformer"/> to turn raw downloads into output values.</summary>
    /// <param name="transformer">The asynchronous transformer.</param>
    /// <returns>The context-configuration stage.</returns>
    IContextStage WithTransformer(AsyncTransformer<RawType, OutType> transformer);

    /// <summary>
    /// Convenience overload for synchronous transformers: the result of
    /// <paramref name="transformer"/> is wrapped in an already-completed task and forwarded
    /// to the asynchronous overload.
    /// </summary>
    /// <param name="transformer">The synchronous transformer.</param>
    /// <returns>The context-configuration stage.</returns>
    IContextStage WithTransformer(Func<RawType, OutType> transformer)
        => WithTransformer(raw => Task.FromResult(transformer(raw)));
}
/// <summary>
/// Final stage of the builder: tunes the download context, picks a download strategy and
/// builds the resulting enumerable. Every option returns the stage itself so calls can be chained.
/// </summary>
public interface IContextStage {
    /// <summary>Hands the underlying context builder to <paramref name="configure"/> for custom setup.</summary>
    /// <param name="configure">Callback that receives the context builder.</param>
    /// <returns>The same stage.</returns>
    IContextStage Configure(Action<DownloadContextBuilder<RawType>> configure);

    /// <summary>Sets the degree of parallelism used by the fragment download strategy.</summary>
    /// <param name="degree">Requested degree; the bundled implementation raises values below 1 to 1.</param>
    /// <returns>The same stage.</returns>
    IContextStage WithParallelism(int degree);

    /// <summary>Sets the timeout on the download context.</summary>
    /// <param name="timeout">The timeout to apply.</param>
    /// <returns>The same stage.</returns>
    IContextStage WithTimeout(TimeSpan timeout);

    /// <summary>Registers a reporter that receives retry reports.</summary>
    /// <param name="reporter">The progress sink for <see cref="RetryReport"/> values.</param>
    /// <returns>The same stage.</returns>
    IContextStage WithRetryReporter(IProgress<RetryReport> reporter);

    /// <summary>
    /// Selects the fragment download strategy. Mutually exclusive with
    /// <see cref="UsePuppet(AsyncManipulator)"/>; the last call wins.
    /// </summary>
    /// <returns>The same stage.</returns>
    IContextStage UseFragments();

    /// <summary>
    /// Selects the puppet-browser download strategy. Mutually exclusive with
    /// <see cref="UseFragments"/>; the last call wins.
    /// </summary>
    /// <remarks>
    /// Exposed on the interface because the fluent chain only ever hands out
    /// <see cref="IContextStage"/>; without it the puppet strategy is unreachable for callers.
    /// </remarks>
    /// <param name="manipulator">The page manipulator.</param>
    /// <returns>The same stage.</returns>
    IContextStage UsePuppet(AsyncManipulator manipulator);

    /// <summary>Builds the download context and the enumerable that performs the download.</summary>
    /// <returns>The enumerable producing the transformed results.</returns>
    DownloadEnumerable<OutType> Build();
}
/* ────────────────────────── Implementation ────────────────────────── */
/// <summary>Which of a novel's two sources a builder is started from.</summary>
private enum SourceKind {
    /// <summary>The novel's associated meta source and meta template data.</summary>
    Meta,

    /// <summary>The novel's associated text source and text template data.</summary>
    Text
}
/// <summary>
/// Shared implementation of the novel-based entry points: resolves the requested source
/// and wraps it, together with a fresh context builder, in a link stage.
/// </summary>
/// <param name="novelKey">Key of the novel to look up.</param>
/// <param name="data">Dictionary the novel is read from; also handed to the link stage.</param>
/// <param name="kind">Which of the novel's sources to use.</param>
/// <returns>The link-selection stage.</returns>
private static ILinkStage Create(DataKey<TextResource> novelKey, BeamDataDictionary data, SourceKind kind) {
    var resolved = Resolve(novelKey, data, kind);

    // The builder starts with an empty link list as a placeholder; the link stage fills it in later.
    var noLinksYet = Array.Empty<SourceLink>();
    var contextBuilder = new DownloadContextBuilder<RawType>().WithLinks(noLinksYet);

    return new LinkStage(resolved.Source, resolved.Initial, data, contextBuilder);
}
/// <summary>
/// Looks up the novel and returns the web source and initial template state for the
/// requested <paramref name="kind"/>.
/// </summary>
/// <param name="novelKey">Key of the novel to look up in <paramref name="data"/>.</param>
/// <param name="data">Dictionary the novel is read from.</param>
/// <param name="kind">Selects the meta or the text source/template pair.</param>
/// <returns>The resolved source and its initial state; neither is <c>null</c>.</returns>
/// <exception cref="KeyNotFoundException">The novel is not present in <paramref name="data"/>.</exception>
/// <exception cref="InvalidOperationException">
/// The requested source, or its initial template data, is missing.
/// </exception>
private static (WebResource Source, State Initial) Resolve(DataKey<TextResource> novelKey, BeamDataDictionary data, SourceKind kind) {
    if (!data.Novels.TryGetValue(novelKey, out var tr))
        throw new KeyNotFoundException($"Novel '{novelKey}' not found in BeamDataDictionary.");
    var textRecord = tr.ToRecord(data);

    if (kind == SourceKind.Meta) {
        WebResource metaSource = textRecord.AssociatedMetaSource
            ?? throw new InvalidOperationException($"Meta source missing for '{novelKey}'.");
        State metaInitial = textRecord.Resource.MetaTemplateInitialData
            ?? throw new InvalidOperationException("Meta template data missing.");
        return (metaSource, metaInitial);
    }

    WebResource textSource = textRecord.AssociatedSource
        ?? throw new InvalidOperationException($"Text source missing for '{novelKey}'.");
    // Guarded like the meta branch: the return type promises a non-null state, so fail here
    // with a descriptive error instead of letting a null surface later in a link stage.
    State textInitial = textRecord.Resource.TemplateInitialData
        ?? throw new InvalidOperationException($"Text template data missing for '{novelKey}'.");
    return (textSource, textInitial);
}
/* ──────────────────────────── Stage types ─────────────────────────── */
/// <summary>
/// Link-selection stage. Serves both the novel-based flow (<see cref="ILinkStage"/>) and the
/// caller-supplied-links flow (<see cref="IAlternativeLinkStage"/>).
/// </summary>
/// <param name="Source">Web source whose template builds the links; <c>null</c> when created by <c>FromScratch</c>.</param>
/// <param name="Initial">State the first link is built from; <c>null</c> when created by <c>FromScratch</c>.</param>
/// <param name="Data">Dictionary holding templates and bindings; <c>null</c> when created by <c>FromScratch</c>.</param>
/// <param name="CtxBuilder">Context builder that receives the selected links.</param>
private sealed record LinkStage(
    WebResource Source,
    State Initial,
    BeamDataDictionary Data,
    DownloadContextBuilder<RawType> CtxBuilder) : ILinkStage, IAlternativeLinkStage {
    // Upper bound handed to the link generator; stays null unless WithRange was called.
    private State? endState;
    // Set once the generator has captured Initial/endState, after which WithRange is rejected.
    private bool linksFrozen = false;

    /// <summary>Builds one link from the source's template and the initial state.</summary>
    /// <returns>The transformer-selection stage.</returns>
    public ITransformStage WithLink() {
        var link = Data.Templates[Source.Key].Builder.Build(Initial);
        CtxBuilder.WithLinks(new[] { link });
        return new TransformStage(Source, Data, CtxBuilder);
    }

    /// <summary>
    /// Installs an ordered link generator that starts at the initial state and is bounded by
    /// the end state set through <see cref="WithRange"/> (or unbounded-by-state when none was set).
    /// </summary>
    /// <returns>The transformer-selection stage.</returns>
    public ITransformStage WithLinkGenerator() {
        var template = Data.Templates[Source.Key];
        var generator = SourceLinkEnumerable.FromGenerator(new OrderedSourceLinkGenerator(
            template.Builder,
            new NumberedStateChanger(template.Factory.Behavior),
            Initial, endState));
        CtxBuilder.WithLinks(generator);
        linksFrozen = true;
        return new TransformStage(Source, Data, CtxBuilder);
    }

    /// <summary>Uses caller-supplied links as the download input.</summary>
    /// <param name="links">The links to download.</param>
    /// <returns>The transformer-selection stage.</returns>
    public IAlternativeTransformStage WithLinks(IEnumerable<SourceLink> links) {
        CtxBuilder.WithLinks(links);
        return new TransformStage(Source, Data, CtxBuilder);
    }

    /// <summary>
    /// Moves the initial state to <c>range.Start - 1</c> and records an end state at
    /// <c>range.End - 1</c>, both applied through a <c>NumberedStateChanger</c>.
    /// </summary>
    /// <param name="range">Absolute start and end positions; start may equal end.</param>
    /// <returns>This stage.</returns>
    /// <exception cref="InvalidOperationException">Called after <see cref="WithLinkGenerator"/>.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// The range uses from-end (<c>^n</c>) indices, or its end is less than its start.
    /// </exception>
    public ILinkStage WithRange(Range range) {
        if (linksFrozen)
            throw new InvalidOperationException("WithRange must be called before WithLinkGenerator");
        // Index.Value ignores the from-end flag, so "^5" would silently be read as position 5.
        if (range.Start.IsFromEnd || range.End.IsFromEnd)
            throw new ArgumentOutOfRangeException(nameof(range), "From-end indices (^n) are not supported; use absolute start and end positions.");
        if (range.End.Value < range.Start.Value)
            throw new ArgumentOutOfRangeException(nameof(range), "Range start must be <= end.");

        var template = Data.Templates[Source.Key];
        var stateChanger = new NumberedStateChanger(template.Factory.Behavior);
        // Copy before Initial is moved so the end state is derived from the unshifted state.
        endState = Initial.Copy();
        // NOTE(review): Apply appears to modify Initial in place, so a second WithRange call (or any
        // other holder of this State instance) would see the shifted state — confirm this is intended.
        stateChanger.Apply(Initial, range.Start.Value - 1);
        stateChanger.Apply(endState, range.End.Value - 1);
        return this;
    }
}
/// <summary>
/// Transformer-selection stage shared by both builder flows; pairs the context builder
/// with the chosen transformer and moves on to the context stage.
/// </summary>
/// <param name="Source">Web source whose bindings feed the transformer factory.</param>
/// <param name="Data">Dictionary the bindings are looked up in.</param>
/// <param name="CtxBuilder">Context builder carried over from the link stage.</param>
private sealed record TransformStage(
    WebResource Source,
    BeamDataDictionary Data,
    DownloadContextBuilder<RawType> CtxBuilder) : ITransformStage, IAlternativeTransformStage {
    /// <summary>
    /// Looks up the source's bindings, lets <paramref name="factory"/> create the transformer
    /// from them, and continues with that transformer.
    /// </summary>
    /// <param name="factory">Creates the transformer from the source's data bindings.</param>
    /// <returns>The context-configuration stage.</returns>
    public IContextStage WithTransformer(Func<DataBindings, AsyncTransformer<RawType, OutType>> factory) {
        var bindings = Data.Bindings[Source.Bindings];
        var created = factory(bindings);
        return new ContextStage(CtxBuilder, created);
    }

    /// <summary>Continues with a transformer supplied directly by the caller.</summary>
    /// <param name="transformer">The asynchronous transformer.</param>
    /// <returns>The context-configuration stage.</returns>
    public IContextStage WithTransformer(AsyncTransformer<RawType, OutType> transformer)
        => new ContextStage(CtxBuilder, transformer);
}
/// <summary>
/// Final builder stage: collects the operational settings, chooses the unit downloader that
/// matches the selected strategy and transformer type, and builds the download enumerable.
/// </summary>
private sealed class ContextStage : IContextStage {
    private readonly DownloadContextBuilder<RawType> _ctxBuilder;
    private readonly AsyncTransformer<RawType, OutType> _transformer;
    // Only handed to the fragment downloaders; the single-link downloaders do not take it.
    private int _parallelism = 4;
    private bool _useFragments = false;
    private AsyncManipulator? _useManipulator = null;

    /// <summary>Creates the stage from the context builder and the chosen transformer.</summary>
    /// <param name="ctxBuilder">Context builder carried over from the previous stages.</param>
    /// <param name="transformer">Transformer applied to every raw download.</param>
    public ContextStage(DownloadContextBuilder<RawType> ctxBuilder, AsyncTransformer<RawType, OutType> transformer) {
        _ctxBuilder = ctxBuilder;
        _transformer = transformer;
    }

    /// <summary>Hands the context builder to <paramref name="configure"/> for custom setup.</summary>
    /// <param name="configure">Callback that receives the context builder.</param>
    /// <returns>This stage.</returns>
    public IContextStage Configure(Action<DownloadContextBuilder<RawType>> configure) {
        configure(_ctxBuilder);
        return this;
    }

    /// <summary>Sets the degree of parallelism; values below 1 are raised to 1.</summary>
    /// <param name="degree">Requested degree of parallelism.</param>
    /// <returns>This stage.</returns>
    public IContextStage WithParallelism(int degree) {
        _parallelism = Math.Max(1, degree);
        return this;
    }

    /// <summary>Forwards the timeout to the context builder.</summary>
    /// <param name="timeout">The timeout to apply.</param>
    /// <returns>This stage.</returns>
    public IContextStage WithTimeout(TimeSpan timeout) {
        _ctxBuilder.WithTimeOut(timeout);
        return this;
    }

    /// <summary>Forwards the retry reporter to the context builder.</summary>
    /// <param name="reporter">The progress sink for retry reports.</param>
    /// <returns>This stage.</returns>
    public IContextStage WithRetryReporter(IProgress<RetryReport> reporter) {
        _ctxBuilder.WithRetryReporter(reporter);
        return this;
    }

    /// <summary>
    /// Uses fragments to download multiple links in parallel. This strategy is mutually exclusive with <see cref="UsePuppet(AsyncManipulator)"/>
    /// </summary>
    /// <returns>This stage.</returns>
    public IContextStage UseFragments() {
        // Last call wins: selecting fragments drops any previously chosen manipulator.
        _useManipulator = null;
        _useFragments = true;
        return this;
    }

    /// <summary>
    /// Use a puppet browser to download the links. This strategy is mutually exclusive with <see cref="UseFragments"/>
    /// </summary>
    /// <param name="manipulator">The page manipulator</param>
    /// <returns>This stage.</returns>
    public IContextStage UsePuppet(AsyncManipulator manipulator) {
        // Last call wins: selecting the puppet strategy switches fragments off.
        _useFragments = false;
        _useManipulator = manipulator;
        return this;
    }

    /// <summary>
    /// Picks the unit downloader matching the strategy flags, the runtime type of the
    /// transformer and the runtime type of the context's failure predicates.
    /// </summary>
    /// <param name="context">Context supplying the web/client handles, predicates and logger.</param>
    /// <returns>
    /// The downloader, typed as <see cref="object"/> because the fragment and single-link
    /// downloaders are cast to different interfaces by the caller.
    /// </returns>
    /// <exception cref="NotSupportedException">No downloader exists for the current combination.</exception>
    private object ConstructUnitDownloader(DownloadContext<RawType> context) {
        return (_useFragments, _useManipulator, _transformer, context.AsyncFailurePredicates) switch {
            // ──────────────── fragmented HTML ────────────────
            (true, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
             AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
                => new UnitFragmentDownloader<OutType>(
                    context.Web,
                    asyncHtmlTransformer,
                    documentFailurePredicates,
                    _parallelism,
                    context.DownloadLogger),
            // ──────────────── fragmented binary ────────────────
            (true, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
             AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
                => new UnitFragmentDownloaderBinary<OutType>(
                    context.Client,
                    asyncBinaryTransformer,
                    responseFailurePredicates,
                    _parallelism,
                    context.DownloadLogger),
            // ──────────────── single HTML ────────────────
            (false, null, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
             AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
                => new UnitDownloader<OutType>(
                    context.Web,
                    asyncHtmlTransformer,
                    documentFailurePredicates),
            // ──────────────── single binary ────────────────
            (false, null, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
             AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
                => new UnitDownloaderBinary<OutType>(
                    context.Client,
                    asyncBinaryTransformer,
                    responseFailurePredicates),
            // ──────────────── single puppet binary ────────────────
            (false, AsyncManipulator manipulator, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
             AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
                => new PuppetUnitDownloader<OutType>(
                    context.Client,
                    manipulator,
                    asyncBinaryTransformer,
                    responseFailurePredicates),
            // ──────────────── single puppet HTML ────────────────
            (false, AsyncManipulator manipulator, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
             AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
                => new PuppetUnitPageDownloader<OutType>(
                    context.Web,
                    manipulator,
                    asyncHtmlTransformer,
                    documentFailurePredicates),
            _ => throw CreateUnsupportedCombinationException(context),
        };
    }

    /// <summary>
    /// Builds the error for a strategy/transformer/predicate combination that no switch arm
    /// covers. The message lists all four values the switch inspects.
    /// </summary>
    /// <param name="context">Context whose failure predicates were inspected.</param>
    /// <returns>The exception to throw.</returns>
    private NotSupportedException CreateUnsupportedCombinationException(DownloadContext<RawType> context) {
        var hasManipulator = _useManipulator is not null;
        return new NotSupportedException(
            $"Unsupported transformer / failure-predicate combination. Missing pattern: fragments={_useFragments} , manipulator={hasManipulator} , {_transformer.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}");
    }

    /// <summary>
    /// Wraps the unit downloader in the sequential downloader for the selected strategy.
    /// </summary>
    /// <param name="context">The built context; a copy of it is handed to the downloader.</param>
    /// <returns>An enumerator over the ordered, transformed results.</returns>
    private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
        // The downloader works on its own copy of the context, rebuilt from the original.
        var copyOfContext = context.CreateBuilder().Build();

        if (_useFragments) {
            return new SequentialFragmentDownloader<RawType, OutType>(
                copyOfContext,
                ctx => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(ctx),
                context.DownloadLogger).UnwrapFragmented();
        }

        return new SequentialDownloader<RawType, OutType>(
            copyOfContext,
            ctx => (IUnitDownloader<OutType>)ConstructUnitDownloader(ctx),
            context.DownloadLogger).WrapOrdered();
    }

    /// <summary>Builds the context and the enumerable that performs the download.</summary>
    /// <returns>The enumerable producing the transformed results.</returns>
    public DownloadEnumerable<OutType> Build() {
        var context = _ctxBuilder.Build();
        return new DownloadEnumerable<OutType>(ConstructDownloader(context));
    }
}
}
}