diff --git a/Beam.Dynamic/Beam.Dynamic.csproj b/Beam.Dynamic/Beam.Dynamic.csproj index 7dc4e64..2a6cd5c 100644 --- a/Beam.Dynamic/Beam.Dynamic.csproj +++ b/Beam.Dynamic/Beam.Dynamic.csproj @@ -4,19 +4,21 @@ net9.0 enable enable + Beam Dynamic + aeqw89 + qwsdcvghyu + + Beam utilities facilitating dynamic fetching of elements of webpages + https://github.com/qwsdcvghyu89/Beam + aeqw89.Beam.Dynamic - - - - - - ..\..\aeqw89.DataKeys\aeqw89.DataKeys\bin\Debug\net9.0\aeqw89.DataKeys.dll - - - ..\..\aeqw89.PersistentData\aeqw89.PersistentData\bin\Release\net9.0\aeqw89.PersistentData.dll - + + + + + diff --git a/Beam.Dynamic/Binding.cs b/Beam.Dynamic/Binding.cs index c4897d6..a66624f 100644 --- a/Beam.Dynamic/Binding.cs +++ b/Beam.Dynamic/Binding.cs @@ -16,6 +16,7 @@ namespace Beam.Dynamic { public string? ArrayDelimiters { get; set; } public string? XPath { get; set; } public string? CssPath { get; set; } + public string? Text { get; set; } private IDataProvider? Provider_; public IDataProvider? Provider { get => Provider_; diff --git a/Beam.Dynamic/DataBindings.cs b/Beam.Dynamic/DataBindings.cs index d0c89c9..95ad480 100644 --- a/Beam.Dynamic/DataBindings.cs +++ b/Beam.Dynamic/DataBindings.cs @@ -1,7 +1,7 @@ using HtmlAgilityPack; namespace Beam.Dynamic { - public class DataBindings { + public record class DataBindings { public Binding? Title { get; set; } public Binding? Authors { get; set; } public Binding? Description { get; set; } @@ -9,7 +9,7 @@ namespace Beam.Dynamic { public Binding? Language { get; set; } public Binding? Tags { get; set; } - public ResolvedBindings Resolve(HtmlDocument doc) { + public virtual ResolvedBindings Resolve(HtmlDocument doc) { return new ResolvedBindings() { Title = Title?.Resolve(doc), Authors = Authors?.Resolve(doc) ?? Array.Empty(), @@ -21,7 +21,7 @@ namespace Beam.Dynamic { } } - public class ResolvedBindings { + public record class ResolvedBindings { public string? Title { get; set; } public string[]? Authors { get; set; } public string? Description { get; set; } diff --git a/Beam.Dynamic/IDataProvider.cs b/Beam.Dynamic/IDataProvider.cs index e2e43b8..95e961e 100644 --- a/Beam.Dynamic/IDataProvider.cs +++ b/Beam.Dynamic/IDataProvider.cs @@ -1,8 +1,8 @@ using HtmlAgilityPack; namespace Beam.Dynamic { - [System.Text.Json.Serialization.JsonDerivedType(typeof(ParagraphedContentDataProvider), 20)] - [System.Text.Json.Serialization.JsonDerivedType(typeof(ListContentDataProvider), 21)] + [System.Text.Json.Serialization.JsonDerivedType(typeof(ParagraphedContentDataProvider), "paragraphed-data-provider")] + [System.Text.Json.Serialization.JsonDerivedType(typeof(ListContentDataProvider), "list-data-provider")] public interface IDataProvider { public string Get(HtmlDocument document); public HtmlNode? GetNode(HtmlDocument document); diff --git a/Beam.Dynamic/OnlineCleaner.cs b/Beam.Dynamic/OnlineCleaner.cs index 6ccd781..7a8097d 100644 --- a/Beam.Dynamic/OnlineCleaner.cs +++ b/Beam.Dynamic/OnlineCleaner.cs @@ -1,6 +1,8 @@ using HtmlAgilityPack; +using Microsoft.Recognizers.Text.Number; using System; using System.Collections.Generic; +using System.Globalization; using System.Linq; using System.Text; using System.Text.RegularExpressions; @@ -25,6 +27,19 @@ namespace Beam.Dynamic { }); } + public static List ParseNumbers(string text, string from) { + var results = NumberRecognizer.RecognizeNumber(text, from, NumberOptions.None, false); + var resolved = results.Select((x) => { + if (x.Resolution.TryGetValue("value", out var value) && double.TryParse(value.ToString(), out var number)) + return (int?)number; + return null; + }) + .Where((x) => x.HasValue).ToList(); + if (resolved.Count == 0) + return []; + return resolved.Select((x) => x!.Value).ToList(); + } + public static string Clean(string? onlineText) { if (string.IsNullOrWhiteSpace(onlineText)) return ""; diff --git a/Beam.Exports/Beam.Exports.csproj b/Beam.Exports/Beam.Exports.csproj index 9d971be..1ad0212 100644 --- a/Beam.Exports/Beam.Exports.csproj +++ b/Beam.Exports/Beam.Exports.csproj @@ -4,10 +4,17 @@ net9.0 enable enable + Beam.Exports + aeqw89 + qwsdcvghyu + Beam library that facilitates exporting different kinds of views for IDocuments + https://github.com/qwsdcvghyu89/Beam + https://github.com/qwsdcvghyu89/Beam + aeqw89.Beam.Exports - + diff --git a/Beam.Puppeteer/Beam.Puppeteer.csproj b/Beam.Puppeteer/Beam.Puppeteer.csproj new file mode 100644 index 0000000..6e23fcf --- /dev/null +++ b/Beam.Puppeteer/Beam.Puppeteer.csproj @@ -0,0 +1,17 @@ + + + + net9.0 + enable + enable + + + + + + + + + + + diff --git a/Beam.Puppeteer/PuppetedUnitDownloader.cs b/Beam.Puppeteer/PuppetedUnitDownloader.cs new file mode 100644 index 0000000..dd571af --- /dev/null +++ b/Beam.Puppeteer/PuppetedUnitDownloader.cs @@ -0,0 +1,39 @@ + +using HtmlAgilityPack; +using Microsoft.Playwright; + +namespace Beam.Puppeteer { + public class PuppetContext(IPlaywright playwright, IBrowser browser) { + public IPlaywright Playwright { get; set; } = playwright; + public IBrowser Browser { get; set; } = browser; + } + + public class PuppetUnitDownloader : UnitDownloader { + public PuppetContext Context { get; } + + public PuppetUnitDownloader(PuppetContext pc, DownloadContext context) + : base(context.Web, context.AsyncTranformer, context.AsyncFailurePredicates) { + Context = pc; + } + + protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) { + var page = await Context.Browser.NewPageAsync(); + try { + var content = await page.ContentAsync(); + await page.CloseAsync(); + + HtmlDocument doc = new(); + doc.LoadHtml(content); + var transformed = await Transformer(doc); + if (FailurePredicates is null || !(await IsFailure(doc))) + return (true, transformed); + return (false, default); + } catch (Exception) { + return (false, default); + } finally { + if (!page.IsClosed) + await page.CloseAsync(); + } + } + } +} diff --git a/Beam.Temporary.Cli/Beam.Temporary.Cli.csproj b/Beam.Temporary.Cli/Beam.Temporary.Cli.csproj index 1482db0..a404a59 100644 --- a/Beam.Temporary.Cli/Beam.Temporary.Cli.csproj +++ b/Beam.Temporary.Cli/Beam.Temporary.Cli.csproj @@ -5,9 +5,22 @@ net9.0 enable enable + + Beam.Temporary.Cli + aeqw89 + qwsdcvghyu + A temporary CLI for Beam providing several useful mechanisms + https://github.com/qwsdcvghyu89/Beam + https://github.com/qwsdcvghyu89/Beam + aeqw89.Beam.Temporary.Cli + + + + + @@ -16,19 +29,4 @@ - - - - - - - - - ..\..\aeqw89.DataKeys\aeqw89.DataKeys\bin\Debug\net9.0\aeqw89.DataKeys.dll - - - ..\..\aeqw89.PersistentData\aeqw89.PersistentData\bin\Release\net9.0\aeqw89.PersistentData.dll - - - diff --git a/Beam.Temporary.Cli/DownloadBuilder.cs b/Beam.Temporary.Cli/DownloadBuilder.cs index fdf314b..8d71ead 100644 --- a/Beam.Temporary.Cli/DownloadBuilder.cs +++ b/Beam.Temporary.Cli/DownloadBuilder.cs @@ -1,5 +1,6 @@ using aeqw89.DataKeys; using Beam.Dynamic; +using Beam; using Microsoft.Extensions.Logging; using System; using System.Collections.Generic; @@ -24,17 +25,18 @@ namespace Beam.Temporary.Cli { public interface ILinkStage { ITransformStage WithLink(); ITransformStage WithLinkGenerator(); + ILinkStage WithRange(Range range); } public interface ITransformStage { - IContextStage WithTransformer(Func> factory); + IContextStage WithTransformer(Func> factory); } - public interface IContextStage { - IContextStage Configure(Action> configure); - IContextStage WithParallelism(int degree); - IContextStage WithTimeout(TimeSpan timeout); - IContextStage WithRetryReporter(IProgress reporter); + public interface IContextStage { + IContextStage Configure(Action> configure); + IContextStage WithParallelism(int degree); + IContextStage WithTimeout(TimeSpan timeout); + IContextStage WithRetryReporter(IProgress reporter); DownloadEnumerable Build(); } @@ -61,7 +63,7 @@ namespace Beam.Temporary.Cli { initial = textRecord.Resource.MetaTemplateInitialData ?? throw new InvalidOperationException("Meta template data missing."); } else { source = textRecord.AssociatedSource ?? throw new InvalidOperationException($"Text source missing for '{novelKey}'."); - initial = textRecord.Resource.TemplateInitialData ?? throw new InvalidOperationException("Template initial data missing."); + initial = textRecord.Resource.TemplateInitialData; } return (source, initial); @@ -74,6 +76,10 @@ namespace Beam.Temporary.Cli { State Initial, BeamDataDictionary Data, DownloadContextBuilder CtxBuilder) : ILinkStage { + + private State? endState; + private bool linksFrozen = false; + public ITransformStage WithLink() { var link = Data.Templates[Source.Key].Builder.Build(Initial); CtxBuilder.WithLinks(new[] { link }); @@ -85,28 +91,45 @@ namespace Beam.Temporary.Cli { var generator = SourceLinkEnumerable.FromGenerator(new OrderedSourceLinkGenerator( template.Builder, new NumberedStateChanger(template.Factory.Behavior), - Initial)); + Initial, endState)); CtxBuilder.WithLinks(generator); + linksFrozen = true; return new TransformStage(Source, Data, CtxBuilder); } + + public ILinkStage WithRange(Range range) { + if (linksFrozen) + throw new InvalidOperationException($"WithRange must be called before WithLinkGenerator"); + if (range.End.Value < range.Start.Value) + throw new ArgumentOutOfRangeException(nameof(range), $" start must be < end"); + var template = Data.Templates[Source.Key]; + var stateChanger = new NumberedStateChanger(template.Factory.Behavior); + endState = Initial.Copy(); + stateChanger.Apply(Initial, range.Start.Value - 1); + stateChanger.Apply(endState, range.End.Value - 1); + return this; + } } private sealed record TransformStage( WebResource Source, BeamDataDictionary Data, DownloadContextBuilder CtxBuilder) : ITransformStage { - public IContextStage WithTransformer(Func> factory) { + public IContextStage WithTransformer(Func> factory) { var transformer = factory(Data.Bindings[Source.Bindings]); - CtxBuilder.WithTransformer(transformer); - return new ContextStage(CtxBuilder); + return new ContextStage(CtxBuilder, transformer); } } - private sealed class ContextStage : IContextStage { + private sealed class ContextStage : IContextStage { private readonly DownloadContextBuilder _ctxBuilder; + private readonly Func _transformer; private int _parallelism = 4; - public ContextStage(DownloadContextBuilder ctxBuilder) => _ctxBuilder = ctxBuilder; + public ContextStage(DownloadContextBuilder ctxBuilder, Func transformer) { + _ctxBuilder = ctxBuilder; + _transformer = transformer; + } public IContextStage Configure(Action> configure) { configure(_ctxBuilder); @@ -134,7 +157,7 @@ namespace Beam.Temporary.Cli { context, ctx => new UnitFragmentDownloader( context.Web, - context.AsyncTranformer, + _transformer, context.AsyncFailurePredicates, _parallelism, context.DownloadLogger), diff --git a/Beam.Temporary.Cli/LinkCollection.cs b/Beam.Temporary.Cli/LinkCollection.cs new file mode 100644 index 0000000..93cc059 --- /dev/null +++ b/Beam.Temporary.Cli/LinkCollection.cs @@ -0,0 +1,13 @@ +using aeqw89.DataKeys; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Beam.Temporary.Cli { + internal class LinkCollection(DataKey key, List links) { + public DataKey Key { get; set; } = key; + public List Links { get; set; } = links; + } +} diff --git a/Beam.Temporary.Cli/NovelStatics.cs b/Beam.Temporary.Cli/NovelStatics.cs index 9dd621d..f58163d 100644 --- a/Beam.Temporary.Cli/NovelStatics.cs +++ b/Beam.Temporary.Cli/NovelStatics.cs @@ -50,8 +50,8 @@ namespace Beam.Temporary.Cli { FriendlyName = "My House Of Horrors", AssociatedSource = wdsAgg, AssociatedMetaSource = wdsAux, - TemplateInitialData = new State(["24349", "2896325"]), - MetaTemplateInitialData = new State(["24349"]) + TemplateInitialData = new ImmutableState(["24349", "2896325"]), + MetaTemplateInitialData = new ImmutableState(["24349"]) }; sdd.Novels.TryAdd(novel.Key, novel); diff --git a/Beam.Temporary.Cli/Program.cs b/Beam.Temporary.Cli/Program.cs index 1a01804..0c1917d 100644 --- a/Beam.Temporary.Cli/Program.cs +++ b/Beam.Temporary.Cli/Program.cs @@ -49,6 +49,8 @@ namespace Beam.Temporary.Cli { NovelStatics.Define_WoDuShu_HouseOfHorrors(BeamData); ClassicTemplates.Register(BeamData); + await sharedContext.ForceSave(); + CancellationTokenSource cts = new(); HtmlTransformer ArticleDataTransformer(DataBindings? binding) => (x) => { @@ -75,38 +77,23 @@ namespace Beam.Temporary.Cli { }; var novel = new DataKey("novels:house_of_horrors"); - var context_aux = Architecture.GetMeta(web, novel, BeamData, cts.Token); - context_aux.RetryReporter = new Progress((x) => Console.WriteLine($"Failed. Trying again. {x.TryNumber}")); - var metaDownloader = new DownloadEnumerable( - new SequentialFragmentDownloader( - context_aux, - (c) => new UnitFragmentDownloader(c.Web, c.AsyncTranformer, c.AsyncFailurePredicates, 4, logger), - logger) - .UnwrapFragmented()); - var metadata = (await metaDownloader.FirstAsync()); var metadata2 = await DownloadBuilder.FromMeta(novel, BeamData) .WithLink() .WithTransformer(ArticleDataTransformer) .Configure((x) => x + .WithDownloadLogger(logger) .WithRetryReporter(new Progress()) .WithTimeOut(TimeSpan.FromSeconds(15))) .Build() .FirstAsync(); - var context = Architecture.GetTextRecord(web, novel, BeamData, metadata.Data, cts.Token); - context.DownloadReporter = new Progress((x) => Console.WriteLine(x)); - var downloader = new DownloadEnumerable( - new SequentialFragmentDownloader( - context, - (c) => new UnitFragmentDownloader(c.Web, c.AsyncTranformer, c.AsyncFailurePredicates, 4, logger), - logger) - .UnwrapFragmented()); - var downloader2 = DownloadBuilder.FromText(novel, BeamData) + .WithRange(1..5) .WithLinkGenerator() .WithTransformer((x) => DocumentTransformer(x, metadata2.Data)) .Configure((x) => x + .WithDownloadLogger(logger) .WithDownloadReporter(new Progress((x) => logger.LogInformation(x.ToString()))) .WithTimeOut(TimeSpan.FromSeconds(15)) ) @@ -122,11 +109,16 @@ namespace Beam.Temporary.Cli { continue; if (meta is not ArticleData articleMetaData) continue; + if (!download.Data.MetaData.TryGetValue(Architecture.BookKey, out var bookmeta)) + continue; + if (meta is not ArticleData bookMetaData) + continue; //Console.WriteLine($"Title: {data.Name}"); //Console.WriteLine($"Description: {data.Description}"); //Console.WriteLine($"Categories: {data.Categories.Aggregate((x, y) => $"{x}; {y}")}"); //Console.WriteLine($"Authors: {data.Authors.Aggregate((x,y) => $"{x}; {y}")}"); Console.WriteLine($"Chapter title: {articleMetaData.Name}"); + Console.WriteLine($"Book title: {bookMetaData.Name}"); //Console.WriteLine($"Content: {download}"); //translationTasks.Add(Task.Run(async () => { diff --git a/Beam.Temporary.Cli/StateChangerFactory.cs b/Beam.Temporary.Cli/StateChangerFactory.cs index caaf663..339b503 100644 --- a/Beam.Temporary.Cli/StateChangerFactory.cs +++ b/Beam.Temporary.Cli/StateChangerFactory.cs @@ -7,6 +7,7 @@ using System.Threading.Tasks; namespace Beam.Temporary.Cli { public class StateChangerFactory { + [JsonIgnore] public IStateChangeBehaviour Behavior => FactoryTable[StateChangerKey](); [JsonInclude] diff --git a/Beam.Temporary.Cli/TextResource.cs b/Beam.Temporary.Cli/TextResource.cs index 0d39516..40aa799 100644 --- a/Beam.Temporary.Cli/TextResource.cs +++ b/Beam.Temporary.Cli/TextResource.cs @@ -13,8 +13,8 @@ namespace Beam.Temporary.Cli { public string? FriendlyName { get; set; } public DataKey? AssociatedSource { get; set; } public DataKey? AssociatedMetaSource { get; set; } - public required State TemplateInitialData { get; set; } - public State? MetaTemplateInitialData { get; set; } + public required ImmutableState TemplateInitialData { get; set; } + public ImmutableState? MetaTemplateInitialData { get; set; } public TextResourceRecord ToRecord(BeamDataDictionary sdd) { return new(this, diff --git a/Beam.Tests/OnlineCleaner.Tests.cs b/Beam.Tests/OnlineCleaner.Tests.cs new file mode 100644 index 0000000..344f488 --- /dev/null +++ b/Beam.Tests/OnlineCleaner.Tests.cs @@ -0,0 +1,34 @@ +using Beam.Dynamic; +using Microsoft.Recognizers.Text; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Beam.Tests { + public class OnlineCleanerTests { + [Fact] + public void Should_LocalizeArabic() { + const string test = "1234"; + + List localized = OnlineCleaner.ParseNumbers(test, Culture.English); + + Assert.Single(localized); + Assert.Equal(1234, localized[0]); + } + + [Fact] + public void Should_LocalizeIndian() { + const string test = "九一五"; + + List localized = OnlineCleaner.ParseNumbers(test, Culture.Chinese); + + Assert.Single(localized); + Assert.Equal(915, localized[0]); + } + + + } +} diff --git a/Beam.Tests/UnitTest1.cs b/Beam.Tests/UnitTest1.cs index d10ff40..4120de6 100644 --- a/Beam.Tests/UnitTest1.cs +++ b/Beam.Tests/UnitTest1.cs @@ -1,7 +1,12 @@ -namespace Beam.Tests { +using System.Globalization; + +namespace Beam.Tests { public class UnitTest1 { [Fact] public void Test1() { + const string test = "九一五"; + + Assert.True(int.TryParse(test, CultureInfo.GetCultureInfo("zh-Hans").NumberFormat, out var number)); } } } diff --git a/Beam.sln b/Beam.sln index 0d6084d..e482b22 100644 --- a/Beam.sln +++ b/Beam.sln @@ -13,6 +13,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Beam.Exports", "Beam.Export EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Beam.Tests", "Beam.Tests\Beam.Tests.csproj", "{E26800C2-0518-49E8-88DF-A0B6ED97D4AB}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Beam.Puppeteer", "Beam.Puppeteer\Beam.Puppeteer.csproj", "{1A967563-D643-401D-A031-68DD43FACE8D}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -39,8 +41,15 @@ Global {E26800C2-0518-49E8-88DF-A0B6ED97D4AB}.Debug|Any CPU.Build.0 = Debug|Any CPU {E26800C2-0518-49E8-88DF-A0B6ED97D4AB}.Release|Any CPU.ActiveCfg = Release|Any CPU {E26800C2-0518-49E8-88DF-A0B6ED97D4AB}.Release|Any CPU.Build.0 = Release|Any CPU + {1A967563-D643-401D-A031-68DD43FACE8D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {1A967563-D643-401D-A031-68DD43FACE8D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {1A967563-D643-401D-A031-68DD43FACE8D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {1A967563-D643-401D-A031-68DD43FACE8D}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {363CAF17-9E48-45B9-AA3F-78BB5E95DB0E} + EndGlobalSection EndGlobal diff --git a/Beam/Beam.csproj b/Beam/Beam.csproj index 8d25a7b..f5421bc 100644 --- a/Beam/Beam.csproj +++ b/Beam/Beam.csproj @@ -4,22 +4,26 @@ net9.0 enable enable + + True + Beam + aeqw89 + qwsdcvghyu + A library for downloading internet resources + https://github.com/qwsdcvghyu89/Beam + https://github.com/qwsdcvghyu89/Beam + aeqw89.Beam - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - + - - ..\..\aeqw89.DataKeys\aeqw89.DataKeys\bin\Debug\net9.0\aeqw89.DataKeys.dll - + diff --git a/Beam/DownloadContext.cs b/Beam/DownloadContext.cs index ebd34e1..ba4789d 100644 --- a/Beam/DownloadContext.cs +++ b/Beam/DownloadContext.cs @@ -5,12 +5,13 @@ using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; -using FluentBuilder; using Microsoft.Extensions.Logging; namespace Beam { public delegate T HtmlTransformer(HtmlDocument doc); + public delegate Task AsyncTransformer(T elem); public delegate Task AsyncHtmlTransformer(HtmlDocument doc); + public delegate Task AsyncBinaryTransformer(byte[] bin); public class DownloadContext : IDisposable { private bool disposedValue; @@ -19,8 +20,6 @@ namespace Beam { => DownloadContextBuilder.FromContext(this); public HtmlWeb Web { get; } - public HtmlTransformer Transformer { get; } - public AsyncHtmlTransformer AsyncTranformer { get; } public IProgress? DownloadReporter { get; set; } public IProgress? RetryReporter { get; set; } public AsyncDownloadFailurePredicate?[]? AsyncFailurePredicates { get; } @@ -33,8 +32,6 @@ namespace Beam { public DownloadContext(HtmlWeb web, IEnumerable links, CancellationToken cancellationToken = default, - HtmlTransformer? transformer = null, - AsyncHtmlTransformer? asyncTransformer = null, IProgress? downloadReporter = null, IProgress? retryReporter = null, AsyncDownloadFailurePredicate?[]? asyncFailurePredicates = null, @@ -46,16 +43,6 @@ namespace Beam { Web = web; Links = links; CancellationToken = cancellationToken; - if (transformer is null && asyncTransformer is null) - throw new ArgumentException($"Either {nameof(transformer)} or {nameof(asyncTransformer)} must be not null."); - - Transformer = transformer!; - AsyncTranformer = asyncTransformer!; - if (transformer is null && asyncTransformer is not null) - Transformer = (x) => asyncTransformer(x).Result; - if (asyncTransformer is null && transformer is not null) - AsyncTranformer = (x) => Task.FromResult(transformer(x)); - DownloadReporter = downloadReporter; RetryReporter = retryReporter; AsyncFailurePredicates = asyncFailurePredicates; diff --git a/Beam/DownloadContextBuilder.cs b/Beam/DownloadContextBuilder.cs index 9fd67ca..ac12fc4 100644 --- a/Beam/DownloadContextBuilder.cs +++ b/Beam/DownloadContextBuilder.cs @@ -10,8 +10,6 @@ namespace Beam { public class DownloadContextBuilder { private HtmlWeb _web; - private HtmlTransformer _transformer; - private AsyncHtmlTransformer _asyncTransformer; private IProgress? _downloadReporter; private IProgress? _retryReporter; private AsyncDownloadFailurePredicate?[]? _asyncFailurePredicates; @@ -34,16 +32,6 @@ namespace Beam { return this; } - public DownloadContextBuilder WithTransformer(HtmlTransformer transformer) { - _transformer = transformer; - return this; - } - - public DownloadContextBuilder WithAsyncTransformer(AsyncHtmlTransformer asyncTransformer) { - _asyncTransformer = asyncTransformer; - return this; - } - public DownloadContextBuilder WithDownloadReporter(IProgress downloadReporter) { _downloadReporter = downloadReporter; return this; @@ -91,8 +79,6 @@ namespace Beam { web: _web, links: _links, cancellationToken: _cancellationToken, - transformer: _transformer, - asyncTransformer: _asyncTransformer, downloadReporter: _downloadReporter, retryReporter: _retryReporter, asyncFailurePredicates: _asyncFailurePredicates, @@ -115,8 +101,6 @@ namespace Beam { return new DownloadContextBuilder(existing.Web) .WithLinks(existing.Links) .WithCancellationToken(existing.CancellationToken) - .WithTransformer(existing.Transformer) - .WithAsyncTransformer(existing.AsyncTranformer) .WithDownloadReporter(existing.DownloadReporter!) .WithRetryReporter(existing.RetryReporter!) .WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty>()) diff --git a/Beam/ImmutableState.cs b/Beam/ImmutableState.cs new file mode 100644 index 0000000..1ba4273 --- /dev/null +++ b/Beam/ImmutableState.cs @@ -0,0 +1,23 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Beam { + public readonly struct ImmutableState(object[] state) { + readonly object[] state = state; + + public readonly Span GetState() => state; + + public readonly State Copy() + => new((object[])state.Clone()); + + public readonly object this[Index i] { + get => state[i]; + } + + public static implicit operator State(ImmutableState state) + => state.Copy(); + } +} diff --git a/Beam/OrderedSourceLinkGenerator.cs b/Beam/OrderedSourceLinkGenerator.cs index 5d32eb3..49606d5 100644 --- a/Beam/OrderedSourceLinkGenerator.cs +++ b/Beam/OrderedSourceLinkGenerator.cs @@ -13,15 +13,17 @@ namespace Beam { public class OrderedSourceLinkGenerator : IEnumerator { public SourceLinkBuilder Builder { get; set; } public NumberedStateChanger Behaviour { get; } + + private State? EndState; private State InitialState; - public OrderedSourceLinkGenerator(SourceLinkBuilder builder, NumberedStateChanger behaviour, params object[] initialState) : this(builder, behaviour, new State(initialState)) { } - public OrderedSourceLinkGenerator(SourceLinkBuilder builder, NumberedStateChanger behaviour, State initialState) { + public OrderedSourceLinkGenerator(SourceLinkBuilder builder, NumberedStateChanger behaviour, State initialState, State? endState = null) { Builder = builder; Behaviour = behaviour; - InitialState = initialState; + InitialState = initialState.Copy(); + EndState = endState?.Copy(); State = InitialState.Copy(); Reset(); @@ -37,8 +39,15 @@ namespace Beam { } public bool MoveNext() { + if (!Current.HasValue || (EndState is not null && State.GetState().SequenceEqual(EndState.GetState()))) { + Current = SourceLink.InvalidLink; + return false; + } Behaviour.Apply(State, 1); Current = Builder.Build(State); + if (!Current.HasValue || (EndState is not null && State.GetState().SequenceEqual(EndState.GetState()))) { + return false; + } return Current.HasValue; } diff --git a/Beam/SequentialDownloader.cs b/Beam/SequentialDownloader.cs index 32f71b8..b0d4896 100644 --- a/Beam/SequentialDownloader.cs +++ b/Beam/SequentialDownloader.cs @@ -35,13 +35,16 @@ namespace Beam { links.Add(new Ordered(LinksEnumerator.Current.Link.ToString(), LastOrder++)); - while (LinksEnumerator.MoveNext() && links.Count < idealLinkCount) + while (LinksEnumerator.MoveNext() && LinksEnumerator.Current != SourceLink.InvalidLink && links.Count < idealLinkCount) links.Add(new Ordered(LinksEnumerator.Current.Link.ToString(), LastOrder++)); if (links.Count == 0) { Logger?.LogInformation("Out of links!"); return false; } + if (links.Any((x) => x.Data == SourceLink.InvalidLink.Link.ToString())) + return false; + var (result, downloadedT) = await unit.TryDownload( links.ToArray(), Context.CancellationToken, diff --git a/Beam/State.cs b/Beam/State.cs index 4c05f44..eaece0e 100644 --- a/Beam/State.cs +++ b/Beam/State.cs @@ -17,6 +17,6 @@ namespace Beam { public object this[Index i] { get => state[i]; set => state[i] = value; - } + } } } diff --git a/Beam/UnitDownloaderBinary.cs b/Beam/UnitDownloaderBinary.cs new file mode 100644 index 0000000..17434b3 --- /dev/null +++ b/Beam/UnitDownloaderBinary.cs @@ -0,0 +1,73 @@ +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; + +namespace Beam { + /// + /// A download-managing class that retrieves binary data through , + /// applies an , and supports failure detection + /// plus exponential-back-off retries. Safe to instantiate per request. + /// + public class UnitDownloaderBinary( + HttpClient client, + AsyncBinaryTransformer transformer, + AsyncDownloadFailurePredicate?[]? failurePredicates = null) + : IUnitDownloader { + public HttpClient Client { get; } = client; + public virtual AsyncBinaryTransformer Transformer { get; } = transformer; + public virtual AsyncDownloadFailurePredicate?[]? FailurePredicates { get; } = failurePredicates; + + public int LinksPerDownload { get; } = 1; + + /// Runs all configured failure predicates in parallel on the raw HTTP response. + protected virtual async Task IsFailure(HttpResponseMessage response) { + if (FailurePredicates is null) return false; + + var failed = false; + await Parallel.ForEachAsync(FailurePredicates, async (pred, _) => { + if (failed || pred is null) return; + if (await pred(response)) failed = true; + }); + return failed; + } + + /// One attempt without retries or back-off. + protected virtual async Task<(bool Success, T? Result)> TryDownloadWithNoRetries(string link, CancellationToken ct) { + try { + using var response = await Client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, ct); + if (!response.IsSuccessStatusCode) return (false, default); + + if (await IsFailure(response)) return (false, default); + + var bytes = await response.Content.ReadAsByteArrayAsync(ct); + return (true, await Transformer(bytes)); + } catch { + return (false, default); + } + } + + public async Task<(bool, T?)> TryDownload( + Ordered[] link, + CancellationToken ct, + int maximumRetryCount = 7, + IProgress? tryProgress = null) { + if (link.Length == 0) return (false, default); + + T? result = default; + var attempt = 0; + + while (attempt < maximumRetryCount) { + ct.ThrowIfCancellationRequested(); + + (var success, result) = await TryDownloadWithNoRetries(link[0].Data, ct); + if (success && result is not null) return (true, result); + + ++attempt; + tryProgress?.Report(new RetryReport(attempt, link[0].Data)); + await Task.Delay((int)Math.Pow(2, attempt) * 1000, ct); + } + + return (false, result); + } + } +} diff --git a/Beam/UnitFragmentDownloader.cs b/Beam/UnitFragmentDownloader.cs index a297630..466768d 100644 --- a/Beam/UnitFragmentDownloader.cs +++ b/Beam/UnitFragmentDownloader.cs @@ -13,11 +13,12 @@ namespace Beam { AsyncHtmlTransformer transformer, AsyncDownloadFailurePredicate?[]? failurePredicate = null, int fragmentSize = 4, - ILogger? logger = null) { + ILogger? logger = null, + IUnitDownloader? internalDownloader = null) { Web = web; Transformer = transformer; FailurePredicate = failurePredicate; - UnitDownloader = new UnitDownloader(Web, Transformer, FailurePredicate); + UnitDownloader = internalDownloader ?? new UnitDownloader(Web, Transformer, FailurePredicate); LinksPerDownload = fragmentSize; Logger = logger; } @@ -28,7 +29,7 @@ namespace Beam { public int LinksPerDownload { get; set; } public ILogger? Logger { get; set; } - private readonly UnitDownloader UnitDownloader; + private readonly IUnitDownloader UnitDownloader; async Task<(bool, Fragment>?)> IUnitDownloader>>.TryDownload(Ordered[] link, CancellationToken ct, int maximumRetryCount, IProgress? tryProgress) { Fragment> fragment = new Fragment>(link.Length);