3baa31a7cc
This introduces a new Puppeteer-based mechanism for downloading web content. It provides a flexible way to manipulate pages during downloads, enhancing the ability to handle dynamic content and improve the overall download process.
284 lines
14 KiB
C#
284 lines
14 KiB
C#
using aeqw89.DataKeys;
|
||
using Beam.Dynamic;
|
||
using Beam;
|
||
using Microsoft.Extensions.Logging;
|
||
using System;
|
||
using System.Collections.Generic;
|
||
using HtmlAgilityPack;
|
||
using Beam.Puppeteer;
|
||
|
||
namespace Beam.Temporary.Cli {
|
||
/// <summary>
|
||
/// Type‑safe, staged builder that prevents callers from forgetting the mandatory steps
|
||
/// (source → link selection → transformer) and surfaces operational knobs as first‑class
|
||
/// methods instead of magic parameters.
|
||
/// </summary>
|
||
public static class DownloadBuilder<RawType, OutType> {
|
||
/* ──────────────────────────── Entry points ─────────────────────────── */
|
||
|
||
/// <summary>
/// Starts a download pipeline from the meta source associated with <paramref name="novelKey"/>.
/// </summary>
/// <param name="novelKey">Key of the novel to look up in <paramref name="data"/>.</param>
/// <param name="data">Dictionary that holds the novels, templates and bindings.</param>
/// <returns>The link-selection stage of the builder.</returns>
/// <exception cref="KeyNotFoundException">The key is not present in <paramref name="data"/>.</exception>
/// <exception cref="InvalidOperationException">The meta source or the meta template data is missing.</exception>
public static ILinkStage FromMeta(DataKey<TextResource> novelKey, BeamDataDictionary data) {
    return Create(novelKey, data, SourceKind.Meta);
}
|
||
|
||
/// <summary>
/// Starts a download pipeline from the text source associated with <paramref name="novelKey"/>.
/// </summary>
/// <param name="novelKey">Key of the novel to look up in <paramref name="data"/>.</param>
/// <param name="data">Dictionary that holds the novels, templates and bindings.</param>
/// <returns>The link-selection stage of the builder.</returns>
/// <exception cref="KeyNotFoundException">The key is not present in <paramref name="data"/>.</exception>
/// <exception cref="InvalidOperationException">The text source is missing.</exception>
public static ILinkStage FromText(DataKey<TextResource> novelKey, BeamDataDictionary data) {
    return Create(novelKey, data, SourceKind.Text);
}
|
||
|
||
/// <summary>
/// Starts a download pipeline that is not backed by a novel record; the caller supplies the links directly.
/// </summary>
/// <returns>A link stage that only accepts an explicit link sequence.</returns>
public static IAlternativeLinkStage FromScratch() {
    // No source, initial state or data dictionary exists on this path, hence the null placeholders.
    // The returned interface only exposes WithLinks, which reads none of them.
    var freshContext = new DownloadContextBuilder<RawType>();
    return new LinkStage(null!, null!, null!, freshContext);
}
|
||
|
||
/* ────────────────────────────── Stages ─────────────────────────────── */
|
||
|
||
/// <summary>
/// First builder stage for record-backed pipelines: decides which links will be downloaded.
/// </summary>
public interface ILinkStage {
    /// <summary>
    /// Restricts the generated links to <paramref name="range"/>.
    /// Must be called before <see cref="WithLinkGenerator"/>.
    /// </summary>
    /// <param name="range">Start and end positions of the links to generate.</param>
    /// <returns>The same stage, so calls can be chained.</returns>
    ILinkStage WithRange(Range range);

    /// <summary>Downloads the single link built from the template's initial state.</summary>
    /// <returns>The transformer-selection stage.</returns>
    ITransformStage WithLink();

    /// <summary>Downloads the sequence of links produced by the template's ordered generator.</summary>
    /// <returns>The transformer-selection stage.</returns>
    ITransformStage WithLinkGenerator();
}
|
||
|
||
/// <summary>
/// First builder stage for pipelines started with <c>FromScratch</c>: the caller provides the links.
/// </summary>
public interface IAlternativeLinkStage {
    /// <summary>Registers the links to download.</summary>
    /// <param name="links">The links handed to the download context.</param>
    /// <returns>The transformer-selection stage.</returns>
    IAlternativeTransformStage WithLinks(IEnumerable<SourceLink> links);
}
|
||
|
||
/// <summary>
/// Second builder stage for record-backed pipelines: selects the transformer applied to each raw download.
/// </summary>
public interface ITransformStage {
    /// <summary>
    /// Creates the transformer from the data bindings of the resolved source.
    /// </summary>
    /// <param name="factory">Receives the source's bindings and returns the transformer to use.</param>
    /// <returns>The context-configuration stage.</returns>
    IContextStage WithTransformer(Func<DataBindings, AsyncTransformer<RawType, OutType>> factory);
}
|
||
|
||
/// <summary>
/// Second builder stage for pipelines started with <c>FromScratch</c>: the transformer is passed directly,
/// without going through data bindings.
/// </summary>
public interface IAlternativeTransformStage {
    /// <summary>Uses an asynchronous transformer as-is.</summary>
    /// <param name="transformer">Converts each raw download into the output type.</param>
    /// <returns>The context-configuration stage.</returns>
    IContextStage WithTransformer(AsyncTransformer<RawType, OutType> transformer);

    /// <summary>
    /// Convenience overload that adapts a synchronous function to the asynchronous overload
    /// by wrapping its result in an already-completed task.
    /// </summary>
    /// <param name="transformer">Synchronous conversion from raw download to output type.</param>
    /// <returns>The context-configuration stage.</returns>
    IContextStage WithTransformer(Func<RawType, OutType> transformer) {
        AsyncTransformer<RawType, OutType> adapted = raw => Task.FromResult(transformer(raw));
        return WithTransformer(adapted);
    }
}
|
||
|
||
/// <summary>
/// Final builder stage: tunes the download context, picks a download strategy and builds the enumerable.
/// </summary>
public interface IContextStage {
    /// <summary>Gives direct access to the underlying context builder for settings without a dedicated method.</summary>
    /// <param name="configure">Callback invoked with the context builder.</param>
    /// <returns>The same stage, so calls can be chained.</returns>
    IContextStage Configure(Action<DownloadContextBuilder<RawType>> configure);

    /// <summary>
    /// Sets the degree of parallelism handed to the fragment downloaders. Values below 1 are treated as 1.
    /// </summary>
    /// <param name="degree">Requested degree of parallelism.</param>
    /// <returns>The same stage, so calls can be chained.</returns>
    IContextStage WithParallelism(int degree);

    /// <summary>Forwards a timeout to the download context.</summary>
    /// <param name="timeout">Timeout passed to the context builder.</param>
    /// <returns>The same stage, so calls can be chained.</returns>
    IContextStage WithTimeout(TimeSpan timeout);

    /// <summary>Forwards a retry reporter to the download context.</summary>
    /// <param name="reporter">Receives retry reports.</param>
    /// <returns>The same stage, so calls can be chained.</returns>
    IContextStage WithRetryReporter(IProgress<RetryReport> reporter);

    /// <summary>
    /// Selects the fragment download strategy. Mutually exclusive with <see cref="UsePuppet(AsyncManipulator)"/>;
    /// the most recent call wins.
    /// </summary>
    /// <returns>The same stage, so calls can be chained.</returns>
    IContextStage UseFragments();

    /// <summary>
    /// Selects the puppet-browser download strategy. Mutually exclusive with <see cref="UseFragments"/>;
    /// the most recent call wins.
    /// </summary>
    /// <remarks>
    /// Declared on the interface because the implementing class is private: without this member the
    /// puppet strategy could not be selected by any caller of the builder.
    /// </remarks>
    /// <param name="manipulator">The page manipulator passed to the puppet downloader.</param>
    /// <returns>The same stage, so calls can be chained.</returns>
    IContextStage UsePuppet(AsyncManipulator manipulator);

    /// <summary>Builds the download context and wraps the resulting downloader in an enumerable.</summary>
    /// <returns>The enumerable that yields the transformed downloads.</returns>
    DownloadEnumerable<OutType> Build();
}
|
||
|
||
/* ────────────────────────── Implementation ────────────────────────── */
|
||
|
||
/// <summary>Which of a text record's two associated sources a pipeline reads from.</summary>
private enum SourceKind {
    /// <summary>The record's associated meta source and meta template data.</summary>
    Meta,

    /// <summary>The record's associated text source and template data.</summary>
    Text,
}
|
||
|
||
/// <summary>
/// Shared implementation of the record-backed entry points: resolves the source and its initial
/// state, then wraps them in the first builder stage.
/// </summary>
/// <param name="novelKey">Key of the novel to resolve.</param>
/// <param name="data">Dictionary that holds the novels, templates and bindings.</param>
/// <param name="kind">Whether the meta or the text source is wanted.</param>
/// <returns>The link-selection stage.</returns>
private static ILinkStage Create(DataKey<TextResource> novelKey, BeamDataDictionary data, SourceKind kind) {
    var resolved = Resolve(novelKey, data, kind);

    // The context starts with an empty link list; the link stage replaces it later.
    var contextBuilder = new DownloadContextBuilder<RawType>().WithLinks(Array.Empty<SourceLink>());

    return new LinkStage(resolved.Source, resolved.Initial, data, contextBuilder);
}
|
||
|
||
/// <summary>
/// Looks up the novel in <paramref name="data"/> and returns the requested source together with
/// the initial template state that belongs to it.
/// </summary>
/// <param name="novelKey">Key of the novel to resolve.</param>
/// <param name="data">Dictionary that holds the novels.</param>
/// <param name="kind">Whether the meta or the text source is wanted.</param>
/// <returns>The source and its initial state.</returns>
/// <exception cref="KeyNotFoundException">The key is not present in <paramref name="data"/>.</exception>
/// <exception cref="InvalidOperationException">
/// The requested source is missing, or (meta only) the meta template data is missing.
/// </exception>
private static (WebResource Source, State Initial) Resolve(DataKey<TextResource> novelKey, BeamDataDictionary data, SourceKind kind) {
    if (!data.Novels.TryGetValue(novelKey, out var resource)) {
        throw new KeyNotFoundException($"Novel '{novelKey}' not found in BeamDataDictionary.");
    }

    var record = resource.ToRecord(data);

    if (kind == SourceKind.Meta) {
        WebResource? metaSource = record.AssociatedMetaSource
            ?? throw new InvalidOperationException($"Meta source missing for '{novelKey}'.");
        State? metaInitial = record.Resource.MetaTemplateInitialData
            ?? throw new InvalidOperationException("Meta template data missing.");
        return (metaSource, metaInitial);
    }

    WebResource? textSource = record.AssociatedSource
        ?? throw new InvalidOperationException($"Text source missing for '{novelKey}'.");
    // Unlike the meta branch, the text template data is taken as-is without a missing-value check.
    State? textInitial = record.Resource.TemplateInitialData;
    return (textSource, textInitial);
}
|
||
|
||
/* ──────────────────────────── Stage types ─────────────────────────── */
|
||
|
||
|
||
/// <summary>
/// Link-selection stage. Holds the resolved source, its initial template state, the data dictionary
/// and the context builder that the later stages keep configuring.
/// </summary>
/// <remarks>
/// Instances created by <c>FromScratch</c> carry null for <c>Source</c>, <c>Initial</c> and <c>Data</c>;
/// on that path only <see cref="WithLinks"/> is reachable, and it reads none of them.
/// </remarks>
private sealed record LinkStage(
    WebResource Source,
    State Initial,
    BeamDataDictionary Data,
    DownloadContextBuilder<RawType> CtxBuilder) : ILinkStage, IAlternativeLinkStage {

    // Range-adjusted copies of Initial; both stay null until WithRange is called.
    private State? startState;
    private State? endState;

    // Set once the generator has been created, after which a range can no longer take effect.
    private bool linksFrozen;

    /// <summary>The state link building starts from: the range start when a range was set, otherwise <c>Initial</c>.</summary>
    private State EffectiveStart => startState ?? Initial;

    /// <summary>Registers the single link built from the effective start state.</summary>
    /// <returns>The transformer-selection stage.</returns>
    public ITransformStage WithLink() {
        var link = Data.Templates[Source.Key].Builder.Build(EffectiveStart);
        CtxBuilder.WithLinks(new[] { link });
        return new TransformStage(Source, Data, CtxBuilder);
    }

    /// <summary>
    /// Registers an ordered link generator that runs from the effective start state to the end state
    /// (null when no range was set).
    /// </summary>
    /// <returns>The transformer-selection stage.</returns>
    public ITransformStage WithLinkGenerator() {
        var template = Data.Templates[Source.Key];
        var generator = SourceLinkEnumerable.FromGenerator(new OrderedSourceLinkGenerator(
            template.Builder,
            new NumberedStateChanger(template.Factory.Behavior),
            EffectiveStart, endState));
        CtxBuilder.WithLinks(generator);
        linksFrozen = true;
        return new TransformStage(Source, Data, CtxBuilder);
    }

    /// <summary>Registers caller-supplied links.</summary>
    /// <param name="links">The links handed to the download context.</param>
    /// <returns>The transformer-selection stage.</returns>
    public IAlternativeTransformStage WithLinks(IEnumerable<SourceLink> links) {
        CtxBuilder.WithLinks(links);
        return new TransformStage(Source, Data, CtxBuilder);
    }

    /// <summary>
    /// Restricts link generation to <paramref name="range"/>. Each bound is applied to its own copy of
    /// <c>Initial</c> as an offset of "bound minus one", so the state object that came from the data
    /// dictionary is never modified and calling this method again replaces the previous range
    /// instead of stacking on top of it.
    /// </summary>
    /// <param name="range">Absolute start and end positions; start may equal end.</param>
    /// <returns>This stage, so calls can be chained.</returns>
    /// <exception cref="InvalidOperationException">Called after <see cref="WithLinkGenerator"/>.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// A bound is a from-end index, or the end lies before the start.
    /// </exception>
    public ILinkStage WithRange(Range range) {
        if (linksFrozen) {
            throw new InvalidOperationException($"WithRange must be called before WithLinkGenerator");
        }

        // Index.Value of a from-end index (^n, or the implicit ^0 of "5..") counts from the end of a
        // sequence whose length is unknown here, so it cannot be used as a position.
        if (range.Start.IsFromEnd || range.End.IsFromEnd) {
            throw new ArgumentOutOfRangeException(nameof(range), "range bounds must be absolute; from-end indices and open-ended ranges are not supported");
        }

        if (range.End.Value < range.Start.Value) {
            throw new ArgumentOutOfRangeException(nameof(range), "start must be <= end");
        }

        var template = Data.Templates[Source.Key];
        var stateChanger = new NumberedStateChanger(template.Factory.Behavior);

        // Work on copies: Initial is the instance obtained from the text record and must stay untouched.
        var first = Initial.Copy();
        var last = Initial.Copy();
        stateChanger.Apply(first, range.Start.Value - 1);
        stateChanger.Apply(last, range.End.Value - 1);

        startState = first;
        endState = last;
        return this;
    }
}
|
||
|
||
/// <summary>
/// Transformer-selection stage shared by the record-backed and the from-scratch pipelines.
/// </summary>
private sealed record TransformStage(
    WebResource Source,
    BeamDataDictionary Data,
    DownloadContextBuilder<RawType> CtxBuilder) : ITransformStage, IAlternativeTransformStage {

    /// <summary>Builds the transformer from the bindings registered for the source.</summary>
    /// <param name="factory">Receives the source's bindings and returns the transformer to use.</param>
    /// <returns>The context-configuration stage.</returns>
    public IContextStage WithTransformer(Func<DataBindings, AsyncTransformer<RawType, OutType>> factory) {
        var sourceBindings = Data.Bindings[Source.Bindings];
        return new ContextStage(CtxBuilder, factory(sourceBindings));
    }

    /// <summary>Uses the given transformer directly; neither the source nor the data dictionary is read.</summary>
    /// <param name="transformer">Converts each raw download into the output type.</param>
    /// <returns>The context-configuration stage.</returns>
    public IContextStage WithTransformer(AsyncTransformer<RawType, OutType> transformer)
        => new ContextStage(CtxBuilder, transformer);
}
|
||
|
||
private sealed class ContextStage : IContextStage {
|
||
// Builder for the download context; mutated by the fluent methods of this stage.
private readonly DownloadContextBuilder<RawType> _ctxBuilder;

// Transformer chosen in the previous stage.
private readonly AsyncTransformer<RawType, OutType> _transformer;

// Degree of parallelism passed to the fragment downloaders.
private int _parallelism = 4;

// Strategy selection: fragments and puppet manipulator are mutually exclusive.
private bool _useFragments;
private AsyncManipulator? _useManipulator;

/// <summary>Creates the stage from the context builder and the transformer selected earlier.</summary>
/// <param name="ctxBuilder">Context builder carried over from the previous stages.</param>
/// <param name="transformer">Transformer applied to each raw download.</param>
public ContextStage(DownloadContextBuilder<RawType> ctxBuilder, AsyncTransformer<RawType, OutType> transformer) {
    (_ctxBuilder, _transformer) = (ctxBuilder, transformer);
}
|
||
|
||
/// <summary>Hands the context builder to <paramref name="configure"/> for arbitrary adjustments.</summary>
/// <param name="configure">Callback invoked with the context builder.</param>
/// <returns>This stage, so calls can be chained.</returns>
public IContextStage Configure(Action<DownloadContextBuilder<RawType>> configure) {
    configure.Invoke(_ctxBuilder);
    return this;
}
|
||
|
||
/// <summary>Stores the degree of parallelism; anything below 1 is raised to 1.</summary>
/// <param name="degree">Requested degree of parallelism.</param>
/// <returns>This stage, so calls can be chained.</returns>
public IContextStage WithParallelism(int degree) {
    _parallelism = degree < 1 ? 1 : degree;
    return this;
}
|
||
|
||
/// <summary>Forwards <paramref name="timeout"/> to the context builder.</summary>
/// <param name="timeout">Timeout passed to the context builder.</param>
/// <returns>This stage, so calls can be chained.</returns>
public IContextStage WithTimeout(TimeSpan timeout) {
    // The builder's own return value is not needed; the call takes effect on _ctxBuilder itself.
    _ctxBuilder.WithTimeOut(timeout);

    return this;
}
|
||
|
||
/// <summary>Forwards <paramref name="reporter"/> to the context builder.</summary>
/// <param name="reporter">Receives retry reports.</param>
/// <returns>This stage, so calls can be chained.</returns>
public IContextStage WithRetryReporter(IProgress<RetryReport> reporter) {
    // The builder's own return value is not needed; the call takes effect on _ctxBuilder itself.
    _ctxBuilder.WithRetryReporter(reporter);

    return this;
}
|
||
|
||
/// <summary>
/// Selects the fragment strategy, which downloads multiple links in parallel.
/// Mutually exclusive with <see cref="UsePuppet(AsyncManipulator)"/>: any manipulator set earlier is discarded.
/// </summary>
/// <returns>This stage, so calls can be chained.</returns>
public IContextStage UseFragments() {
    // Unconditional reset: assigning null when the field is already null changes nothing.
    _useManipulator = null;
    _useFragments = true;
    return this;
}
|
||
|
||
/// <summary>
/// Selects the puppet-browser strategy for downloading the links.
/// Mutually exclusive with <see cref="UseFragments"/>: a fragment selection made earlier is cleared.
/// </summary>
/// <param name="manipulator">The page manipulator</param>
/// <returns>This stage, so calls can be chained.</returns>
public IContextStage UsePuppet(AsyncManipulator manipulator) {
    // Unconditional reset: clearing a flag that is already false changes nothing.
    _useFragments = false;
    _useManipulator = manipulator;
    return this;
}
|
||
|
||
/// <summary>
/// Picks the unit downloader that matches the selected strategy (fragments, puppet or plain), the
/// raw document type of the transformer (HTML or binary) and the failure predicates of the context.
/// </summary>
/// <remarks>
/// The result is typed as <see cref="object"/> because the fragment and non-fragment downloaders are
/// cast to different unit-downloader interfaces by the caller.
/// </remarks>
/// <param name="context">Context that provides the web/client handles, failure predicates and logger.</param>
/// <returns>The unit downloader for the current configuration.</returns>
/// <exception cref="NotSupportedException">
/// No downloader exists for the combination of strategy, transformer type and failure-predicate type.
/// </exception>
private object ConstructUnitDownloader(DownloadContext<RawType> context) {
    return (_useFragments, _useManipulator, _transformer, context.AsyncFailurePredicates) switch {
        // ──────────────── fragmented HTML ────────────────
        (true, _, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
            AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
            => new UnitFragmentDownloader<OutType>(
                context.Web,
                asyncHtmlTransformer,
                documentFailurePredicates,
                _parallelism,
                context.DownloadLogger),

        // ──────────────── fragmented binary ────────────────
        (true, _, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
            AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
            => new UnitFragmentDownloaderBinary<OutType>(
                context.Client,
                asyncBinaryTransformer,
                responseFailurePredicates,
                _parallelism,
                context.DownloadLogger),

        // ──────────────── single HTML ────────────────
        (false, null, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
            AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
            => new UnitDownloader<OutType>(
                context.Web,
                asyncHtmlTransformer,
                documentFailurePredicates),

        // ──────────────── single binary ────────────────
        (false, null, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
            AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
            => new UnitDownloaderBinary<OutType>(
                context.Client,
                asyncBinaryTransformer,
                responseFailurePredicates),

        // ──────────────── single puppet binary ────────────────
        (false, AsyncManipulator manipulator, AsyncTransformer<ByteDocument, OutType> asyncBinaryTransformer,
            AsyncDownloadFailurePredicate<ByteDocument>[] responseFailurePredicates)
            => new PuppetUnitDownloader<OutType>(
                context.Client,
                manipulator,
                asyncBinaryTransformer,
                responseFailurePredicates),

        // ──────────────── single puppet HTML ────────────────
        (false, AsyncManipulator manipulator, AsyncTransformer<HtmlDocument, OutType> asyncHtmlTransformer,
            AsyncDownloadFailurePredicate<HtmlDocument>[] documentFailurePredicates)
            => new PuppetUnitPageDownloader<OutType>(
                context.Web,
                manipulator,
                asyncHtmlTransformer,
                documentFailurePredicates),

        // A null transformer or null predicates also land here (type patterns never match null), so
        // the diagnostic is built null-safely instead of failing with a NullReferenceException.
        _ => throw new NotSupportedException(
            "Unsupported transformer / failure-predicate combination. " +
            $"Missing pattern: fragments={_useFragments} , puppet={_useManipulator is not null} , " +
            $"{_transformer?.GetType().AsUniqueName()} , {context.AsyncFailurePredicates?.GetType().AsUniqueName()}"),
    };
}
|
||
|
||
/// <summary>
/// Creates the sequential downloader for the selected strategy and adapts it to an enumerator of
/// ordered outputs.
/// </summary>
/// <param name="context">The built download context; also supplies the logger.</param>
/// <returns>An enumerator over the ordered, transformed downloads.</returns>
private IAsyncEnumerator<Ordered<OutType>> ConstructDownloader(DownloadContext<RawType> context) {
    // The downloader receives a context rebuilt from the original one rather than the original instance.
    var rebuiltContext = context.CreateBuilder().Build();

    if (_useFragments) {
        var fragmentDownloader = new SequentialFragmentDownloader<RawType, OutType>(
            rebuiltContext,
            unitContext => (IUnitDownloader<Fragment<Ordered<OutType>>>)ConstructUnitDownloader(unitContext),
            context.DownloadLogger);
        return fragmentDownloader.UnwrapFragmented();
    }

    var plainDownloader = new SequentialDownloader<RawType, OutType>(
        rebuiltContext,
        unitContext => (IUnitDownloader<OutType>)ConstructUnitDownloader(unitContext),
        context.DownloadLogger);
    return plainDownloader.WrapOrdered();
}
|
||
|
||
/// <summary>
/// Builds the download context from everything configured so far and wraps the matching downloader
/// in a <c>DownloadEnumerable</c>.
/// </summary>
/// <returns>The enumerable that yields the transformed downloads.</returns>
public DownloadEnumerable<OutType> Build()
    => new DownloadEnumerable<OutType>(ConstructDownloader(_ctxBuilder.Build()));
|
||
}
|
||
}
|
||
}
|