Enhance project metadata and refactor core classes

Updated project files for `Beam.Dynamic`, `Beam.Exports`, `Beam.Temporary.Cli`, and `Beam` to include additional metadata and specific package versions. Refactored `DataBindings` and `ResolvedBindings` to records, added a new `Text` property in `Binding.cs`, and introduced `ParseNumbers` in `OnlineCleaner`. New classes `PuppetContext` and `PuppetUnitDownloader` added for Playwright integration. Introduced `ImmutableState` struct and `UnitDownloaderBinary` class for improved download management. Updated tests in `UnitTest1.cs` for number localization. Added `Beam.Puppeteer` project to the solution.
This commit is contained in:
qwsdcvghyu89
2025-06-23 02:11:19 +03:00
parent a9a22ea23d
commit 482a46b568
27 changed files with 354 additions and 114 deletions
+12 -10
View File
@@ -4,19 +4,21 @@
<TargetFramework>net9.0</TargetFramework> <TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings> <ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable> <Nullable>enable</Nullable>
<Title>Beam Dynamic</Title>
<Authors>aeqw89</Authors>
<Company>qwsdcvghyu</Company>
<Product />
<Description>Beam utilities facilitating dynamic fetching of elements of webpages</Description>
<RepositoryUrl>https://github.com/qwsdcvghyu89/Beam</RepositoryUrl>
<PackageId>aeqw89.Beam.Dynamic</PackageId>
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>
<ProjectReference Include="..\Beam\Beam.csproj" /> <PackageReference Include="aeqw89.DataKeys" Version="1.0.1" />
</ItemGroup> <PackageReference Include="aeqw89.PersistentData" Version="1.0.0" />
<PackageReference Include="aeqw89.Beam" Version="1.0.0" />
<ItemGroup> <PackageReference Include="HtmlAgilityPack" Version="1.11.72" />
<Reference Include="aeqw89.DataKeys"> <PackageReference Include="Microsoft.Recognizers.Text.Number" Version="1.8.13" />
<HintPath>..\..\aeqw89.DataKeys\aeqw89.DataKeys\bin\Debug\net9.0\aeqw89.DataKeys.dll</HintPath>
</Reference>
<Reference Include="aeqw89.PersistentData">
<HintPath>..\..\aeqw89.PersistentData\aeqw89.PersistentData\bin\Release\net9.0\aeqw89.PersistentData.dll</HintPath>
</Reference>
</ItemGroup> </ItemGroup>
</Project> </Project>
+1
View File
@@ -16,6 +16,7 @@ namespace Beam.Dynamic {
public string? ArrayDelimiters { get; set; } public string? ArrayDelimiters { get; set; }
public string? XPath { get; set; } public string? XPath { get; set; }
public string? CssPath { get; set; } public string? CssPath { get; set; }
public string? Text { get; set; }
private IDataProvider? Provider_; private IDataProvider? Provider_;
public IDataProvider? Provider { public IDataProvider? Provider {
get => Provider_; get => Provider_;
+3 -3
View File
@@ -1,7 +1,7 @@
using HtmlAgilityPack; using HtmlAgilityPack;
namespace Beam.Dynamic { namespace Beam.Dynamic {
public class DataBindings { public record class DataBindings {
public Binding? Title { get; set; } public Binding? Title { get; set; }
public Binding? Authors { get; set; } public Binding? Authors { get; set; }
public Binding? Description { get; set; } public Binding? Description { get; set; }
@@ -9,7 +9,7 @@ namespace Beam.Dynamic {
public Binding? Language { get; set; } public Binding? Language { get; set; }
public Binding? Tags { get; set; } public Binding? Tags { get; set; }
public ResolvedBindings Resolve(HtmlDocument doc) { public virtual ResolvedBindings Resolve(HtmlDocument doc) {
return new ResolvedBindings() { return new ResolvedBindings() {
Title = Title?.Resolve(doc), Title = Title?.Resolve(doc),
Authors = Authors?.Resolve(doc) ?? Array.Empty<string>(), Authors = Authors?.Resolve(doc) ?? Array.Empty<string>(),
@@ -21,7 +21,7 @@ namespace Beam.Dynamic {
} }
} }
public class ResolvedBindings { public record class ResolvedBindings {
public string? Title { get; set; } public string? Title { get; set; }
public string[]? Authors { get; set; } public string[]? Authors { get; set; }
public string? Description { get; set; } public string? Description { get; set; }
+2 -2
View File
@@ -1,8 +1,8 @@
using HtmlAgilityPack; using HtmlAgilityPack;
namespace Beam.Dynamic { namespace Beam.Dynamic {
[System.Text.Json.Serialization.JsonDerivedType(typeof(ParagraphedContentDataProvider), 20)] [System.Text.Json.Serialization.JsonDerivedType(typeof(ParagraphedContentDataProvider), "paragraphed-data-provider")]
[System.Text.Json.Serialization.JsonDerivedType(typeof(ListContentDataProvider), 21)] [System.Text.Json.Serialization.JsonDerivedType(typeof(ListContentDataProvider), "list-data-provider")]
public interface IDataProvider { public interface IDataProvider {
public string Get(HtmlDocument document); public string Get(HtmlDocument document);
public HtmlNode? GetNode(HtmlDocument document); public HtmlNode? GetNode(HtmlDocument document);
+15
View File
@@ -1,6 +1,8 @@
using HtmlAgilityPack; using HtmlAgilityPack;
using Microsoft.Recognizers.Text.Number;
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Globalization;
using System.Linq; using System.Linq;
using System.Text; using System.Text;
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
@@ -25,6 +27,19 @@ namespace Beam.Dynamic {
}); });
} }
/// <summary>
/// Recognizes the numbers mentioned in <paramref name="text"/> and returns them as integers.
/// </summary>
/// <param name="text">Text to scan for numbers.</param>
/// <param name="from">Culture code handed to <c>NumberRecognizer.RecognizeNumber</c> (the tests pass e.g. <c>Culture.English</c>).</param>
/// <returns>
/// One entry per recognized number whose resolution carries a parseable <c>"value"</c>;
/// fractional values are truncated toward zero. Empty when nothing usable is recognized.
/// </returns>
public static List<int> ParseNumbers(string text, string from) {
    var numbers = new List<int>();
    foreach (var result in NumberRecognizer.RecognizeNumber(text, from, NumberOptions.None, false)) {
        if (result.Resolution is null || !result.Resolution.TryGetValue("value", out var raw))
            continue;

        // Parse with the invariant culture: relying on the current thread culture makes a value
        // such as "3.14" parse differently (or fail) on machines whose decimal separator is ','.
        // NOTE(review): assumes the recognizer emits "value" in a culture-neutral format - confirm.
        var rendered = Convert.ToString(raw, CultureInfo.InvariantCulture);
        if (!double.TryParse(rendered, NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out var number))
            continue;

        // Casting NaN or an out-of-range double to int yields an unspecified value; drop those instead.
        if (double.IsNaN(number) || number < int.MinValue || number > int.MaxValue)
            continue;

        numbers.Add((int)number);
    }
    return numbers;
}
public static string Clean(string? onlineText) { public static string Clean(string? onlineText) {
if (string.IsNullOrWhiteSpace(onlineText)) if (string.IsNullOrWhiteSpace(onlineText))
return ""; return "";
+8 -1
View File
@@ -4,10 +4,17 @@
<TargetFramework>net9.0</TargetFramework> <TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings> <ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable> <Nullable>enable</Nullable>
<Title>Beam.Exports</Title>
<Authors>aeqw89</Authors>
<Company>qwsdcvghyu</Company>
<Description>Beam library that facilitates exporting different kinds of views for IDocuments</Description>
<PackageProjectUrl>https://github.com/qwsdcvghyu89/Beam</PackageProjectUrl>
<RepositoryUrl>https://github.com/qwsdcvghyu89/Beam</RepositoryUrl>
<PackageId>aeqw89.Beam.Exports</PackageId>
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>
<ProjectReference Include="..\Beam\Beam.csproj" /> <PackageReference Include="aeqw89.Beam" Version="1.0.0" />
</ItemGroup> </ItemGroup>
</Project> </Project>
+17
View File
@@ -0,0 +1,17 @@
<!-- SDK-style project; presumably Beam.Puppeteer.csproj, the project this commit adds to the solution. -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- Microsoft.Playwright supplies the IPlaywright / IBrowser types used by the Playwright-backed downloader. -->
<ItemGroup>
<PackageReference Include="Microsoft.Playwright" Version="1.52.0" />
</ItemGroup>
<!-- NOTE(review): the sibling projects in this commit switched to the aeqw89.Beam NuGet package; this one still references the Beam project directly - confirm that is intended. -->
<ItemGroup>
<ProjectReference Include="..\Beam\Beam.csproj" />
</ItemGroup>
</Project>
+39
View File
@@ -0,0 +1,39 @@
using HtmlAgilityPack;
using Microsoft.Playwright;
namespace Beam.Puppeteer {
/// <summary>
/// Bundles the Playwright driver with the browser instance that downloaders open pages on.
/// </summary>
public class PuppetContext {
    /// <summary>Creates a context from an existing Playwright driver and browser.</summary>
    public PuppetContext(IPlaywright playwright, IBrowser browser) {
        Playwright = playwright;
        Browser = browser;
    }

    /// <summary>The Playwright driver instance.</summary>
    public IPlaywright Playwright { get; set; }

    /// <summary>The browser used to create pages.</summary>
    public IBrowser Browser { get; set; }
}
/// <summary>
/// A <see cref="UnitDownloader{T}"/> that fetches pages through a Playwright browser
/// and hands the resulting HTML to the configured transformer.
/// </summary>
public class PuppetUnitDownloader<T> : UnitDownloader<T> {
    /// <summary>The Playwright/browser pair used to open pages.</summary>
    public PuppetContext Context { get; }

    /// <summary>
    /// Creates a downloader that browses with <paramref name="pc"/> and takes its web client,
    /// transformer and failure predicates from <paramref name="context"/>.
    /// </summary>
    public PuppetUnitDownloader(PuppetContext pc, DownloadContext<T> context)
        : base(context.Web, context.AsyncTranformer, context.AsyncFailurePredicates) {
        Context = pc;
    }

    /// <summary>
    /// Makes a single attempt to load <paramref name="link"/> in a fresh browser page and transform it.
    /// </summary>
    /// <param name="link">Address of the page to download.</param>
    /// <param name="ct">
    /// Cancellation token. NOTE(review): it is not observed by this method; the Playwright calls
    /// used here take no token - confirm whether cancellation should close the page.
    /// </param>
    /// <returns>
    /// <c>(true, value)</c> when the page loaded and was not flagged as a failure;
    /// <c>(false, default)</c> when it was flagged or any exception occurred after the page was opened.
    /// </returns>
    protected override async Task<(bool, T?)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
        var page = await Context.Browser.NewPageAsync();
        try {
            // Navigate first: without this the page is still blank and ContentAsync
            // returns the markup of an empty document for every link.
            await page.GotoAsync(link);
            var content = await page.ContentAsync();
            // The markup is captured; release the page before the (possibly slow) transform runs.
            await page.CloseAsync();

            HtmlDocument doc = new();
            doc.LoadHtml(content);
            var transformed = await Transformer(doc);
            if (FailurePredicates is null || !(await IsFailure(doc)))
                return (true, transformed);
            return (false, default);
        } catch (Exception) {
            // Best-effort contract: any failure during this attempt is reported as "not downloaded".
            return (false, default);
        } finally {
            // Covers the paths where an exception was thrown before the explicit close above.
            if (!page.IsClosed)
                await page.CloseAsync();
        }
    }
}
}
+13 -15
View File
@@ -5,9 +5,22 @@
<TargetFramework>net9.0</TargetFramework> <TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings> <ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable> <Nullable>enable</Nullable>
<Title>Beam.Temporary.Cli</Title>
<Authors>aeqw89</Authors>
<Company>qwsdcvghyu</Company>
<Description>A temporary CLI for Beam providing several useful mechanisms</Description>
<PackageProjectUrl>https://github.com/qwsdcvghyu89/Beam</PackageProjectUrl>
<RepositoryUrl>https://github.com/qwsdcvghyu89/Beam</RepositoryUrl>
<PackageId>aeqw89.Beam.Temporary.Cli</PackageId>
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>
<PackageReference Include="aeqw89.Beam.Exports" Version="1.0.0" />
<PackageReference Include="aeqw89.DataKeys" Version="1.0.1" />
<PackageReference Include="aeqw89.PersistentData" Version="1.0.0" />
<PackageReference Include="aeqw89.Beam" Version="1.0.0" />
<PackageReference Include="aeqw89.Beam.Dynamic" Version="1.0.0" />
<PackageReference Include="Microsoft.Extensions.Logging" Version="9.0.1" /> <PackageReference Include="Microsoft.Extensions.Logging" Version="9.0.1" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="9.0.1" /> <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="9.0.1" />
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="9.0.1" /> <PackageReference Include="Microsoft.Extensions.Logging.Console" Version="9.0.1" />
@@ -16,19 +29,4 @@
<PackageReference Include="System.Linq.Async" Version="6.0.1" /> <PackageReference Include="System.Linq.Async" Version="6.0.1" />
</ItemGroup> </ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Beam.Dynamic\Beam.Dynamic.csproj" />
<ProjectReference Include="..\Beam.Exports\Beam.Exports.csproj" />
<ProjectReference Include="..\Beam\Beam.csproj" />
</ItemGroup>
<ItemGroup>
<Reference Include="aeqw89.DataKeys">
<HintPath>..\..\aeqw89.DataKeys\aeqw89.DataKeys\bin\Debug\net9.0\aeqw89.DataKeys.dll</HintPath>
</Reference>
<Reference Include="aeqw89.PersistentData">
<HintPath>..\..\aeqw89.PersistentData\aeqw89.PersistentData\bin\Release\net9.0\aeqw89.PersistentData.dll</HintPath>
</Reference>
</ItemGroup>
</Project> </Project>
+37 -14
View File
@@ -1,5 +1,6 @@
using aeqw89.DataKeys; using aeqw89.DataKeys;
using Beam.Dynamic; using Beam.Dynamic;
using Beam;
using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging;
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
@@ -24,17 +25,18 @@ namespace Beam.Temporary.Cli {
public interface ILinkStage { public interface ILinkStage {
ITransformStage WithLink(); ITransformStage WithLink();
ITransformStage WithLinkGenerator(); ITransformStage WithLinkGenerator();
ILinkStage WithRange(Range range);
} }
public interface ITransformStage { public interface ITransformStage {
IContextStage WithTransformer(Func<DataBindings, HtmlTransformer<T>> factory); IContextStage<U> WithTransformer<U>(Func<DataBindings, AsyncTransformer<T, U>> factory);
} }
public interface IContextStage { public interface IContextStage<U> {
IContextStage Configure(Action<DownloadContextBuilder<T>> configure); IContextStage<U> Configure(Action<DownloadContextBuilder<T>> configure);
IContextStage WithParallelism(int degree); IContextStage<U> WithParallelism(int degree);
IContextStage WithTimeout(TimeSpan timeout); IContextStage<U> WithTimeout(TimeSpan timeout);
IContextStage WithRetryReporter(IProgress<RetryReport> reporter); IContextStage<U> WithRetryReporter(IProgress<RetryReport> reporter);
DownloadEnumerable<T> Build(); DownloadEnumerable<T> Build();
} }
@@ -61,7 +63,7 @@ namespace Beam.Temporary.Cli {
initial = textRecord.Resource.MetaTemplateInitialData ?? throw new InvalidOperationException("Meta template data missing."); initial = textRecord.Resource.MetaTemplateInitialData ?? throw new InvalidOperationException("Meta template data missing.");
} else { } else {
source = textRecord.AssociatedSource ?? throw new InvalidOperationException($"Text source missing for '{novelKey}'."); source = textRecord.AssociatedSource ?? throw new InvalidOperationException($"Text source missing for '{novelKey}'.");
initial = textRecord.Resource.TemplateInitialData ?? throw new InvalidOperationException("Template initial data missing."); initial = textRecord.Resource.TemplateInitialData;
} }
return (source, initial); return (source, initial);
@@ -74,6 +76,10 @@ namespace Beam.Temporary.Cli {
State Initial, State Initial,
BeamDataDictionary Data, BeamDataDictionary Data,
DownloadContextBuilder<T> CtxBuilder) : ILinkStage { DownloadContextBuilder<T> CtxBuilder) : ILinkStage {
private State? endState;
private bool linksFrozen = false;
public ITransformStage WithLink() { public ITransformStage WithLink() {
var link = Data.Templates[Source.Key].Builder.Build(Initial); var link = Data.Templates[Source.Key].Builder.Build(Initial);
CtxBuilder.WithLinks(new[] { link }); CtxBuilder.WithLinks(new[] { link });
@@ -85,28 +91,45 @@ namespace Beam.Temporary.Cli {
var generator = SourceLinkEnumerable.FromGenerator(new OrderedSourceLinkGenerator( var generator = SourceLinkEnumerable.FromGenerator(new OrderedSourceLinkGenerator(
template.Builder, template.Builder,
new NumberedStateChanger(template.Factory.Behavior), new NumberedStateChanger(template.Factory.Behavior),
Initial)); Initial, endState));
CtxBuilder.WithLinks(generator); CtxBuilder.WithLinks(generator);
linksFrozen = true;
return new TransformStage(Source, Data, CtxBuilder); return new TransformStage(Source, Data, CtxBuilder);
} }
/// <summary>
/// Restricts link generation to the given range by advancing the initial state to the
/// range start and recording an end state for the generator.
/// </summary>
/// <param name="range">
/// Range whose start and end are both from-start indices, with start not greater than end.
/// </param>
/// <returns>This stage, so that <c>WithLinkGenerator</c> can be chained.</returns>
/// <exception cref="InvalidOperationException">Called after <c>WithLinkGenerator</c> fixed the links.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// The range uses a from-end index (<c>^n</c>) or its end precedes its start.
/// </exception>
public ILinkStage WithRange(Range range) {
    if (linksFrozen)
        throw new InvalidOperationException("WithRange must be called before WithLinkGenerator");
    // Index.Value ignores IsFromEnd, so "^2" would silently be treated as "2"; there is no
    // known collection length here to resolve it against, so reject it explicitly.
    if (range.Start.IsFromEnd || range.End.IsFromEnd)
        throw new ArgumentOutOfRangeException(nameof(range), "from-end indices (^n) are not supported");
    // Equal start and end are accepted, so the message states "<=" to match the check.
    if (range.End.Value < range.Start.Value)
        throw new ArgumentOutOfRangeException(nameof(range), "start must be <= end");

    var template = Data.Templates[Source.Key];
    var stateChanger = new NumberedStateChanger(template.Factory.Behavior);
    // Copy before touching Initial so the end state is derived from the unshifted state.
    endState = Initial.Copy();
    // NOTE(review): Initial is modified in place, so calling WithRange twice appears to compound the start offset - confirm.
    stateChanger.Apply(Initial, range.Start.Value - 1);
    stateChanger.Apply(endState, range.End.Value - 1);
    return this;
}
} }
private sealed record TransformStage( private sealed record TransformStage(
WebResource Source, WebResource Source,
BeamDataDictionary Data, BeamDataDictionary Data,
DownloadContextBuilder<T> CtxBuilder) : ITransformStage { DownloadContextBuilder<T> CtxBuilder) : ITransformStage {
public IContextStage WithTransformer(Func<DataBindings, HtmlTransformer<T>> factory) { public IContextStage WithTransformer<U>(Func<DataBindings, Func<object, T>> factory) {
var transformer = factory(Data.Bindings[Source.Bindings]); var transformer = factory(Data.Bindings[Source.Bindings]);
CtxBuilder.WithTransformer(transformer); return new ContextStage<U>(CtxBuilder, transformer);
return new ContextStage(CtxBuilder);
} }
} }
private sealed class ContextStage : IContextStage { private sealed class ContextStage<U> : IContextStage {
private readonly DownloadContextBuilder<T> _ctxBuilder; private readonly DownloadContextBuilder<T> _ctxBuilder;
private readonly Func<object, T> _transformer;
private int _parallelism = 4; private int _parallelism = 4;
public ContextStage(DownloadContextBuilder<T> ctxBuilder) => _ctxBuilder = ctxBuilder; public ContextStage(DownloadContextBuilder<T> ctxBuilder, Func<object, T> transformer) {
_ctxBuilder = ctxBuilder;
_transformer = transformer;
}
public IContextStage Configure(Action<DownloadContextBuilder<T>> configure) { public IContextStage Configure(Action<DownloadContextBuilder<T>> configure) {
configure(_ctxBuilder); configure(_ctxBuilder);
@@ -134,7 +157,7 @@ namespace Beam.Temporary.Cli {
context, context,
ctx => new UnitFragmentDownloader<T>( ctx => new UnitFragmentDownloader<T>(
context.Web, context.Web,
context.AsyncTranformer, _transformer,
context.AsyncFailurePredicates, context.AsyncFailurePredicates,
_parallelism, _parallelism,
context.DownloadLogger), context.DownloadLogger),
+13
View File
@@ -0,0 +1,13 @@
using aeqw89.DataKeys;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Beam.Temporary.Cli {
/// <summary>
/// Pairs a string data key with the list of source links stored under it.
/// </summary>
internal class LinkCollection {
    /// <summary>Creates a collection for <paramref name="key"/> holding <paramref name="links"/>.</summary>
    public LinkCollection(DataKey<string> key, List<SourceLink> links) {
        Key = key;
        Links = links;
    }

    /// <summary>Key identifying this collection.</summary>
    public DataKey<string> Key { get; set; }

    /// <summary>The links belonging to <see cref="Key"/>.</summary>
    public List<SourceLink> Links { get; set; }
}
}
+2 -2
View File
@@ -50,8 +50,8 @@ namespace Beam.Temporary.Cli {
FriendlyName = "My House Of Horrors", FriendlyName = "My House Of Horrors",
AssociatedSource = wdsAgg, AssociatedSource = wdsAgg,
AssociatedMetaSource = wdsAux, AssociatedMetaSource = wdsAux,
TemplateInitialData = new State(["24349", "2896325"]), TemplateInitialData = new ImmutableState(["24349", "2896325"]),
MetaTemplateInitialData = new State(["24349"]) MetaTemplateInitialData = new ImmutableState(["24349"])
}; };
sdd.Novels.TryAdd(novel.Key, novel); sdd.Novels.TryAdd(novel.Key, novel);
+10 -18
View File
@@ -49,6 +49,8 @@ namespace Beam.Temporary.Cli {
NovelStatics.Define_WoDuShu_HouseOfHorrors(BeamData); NovelStatics.Define_WoDuShu_HouseOfHorrors(BeamData);
ClassicTemplates.Register(BeamData); ClassicTemplates.Register(BeamData);
await sharedContext.ForceSave();
CancellationTokenSource cts = new(); CancellationTokenSource cts = new();
HtmlTransformer<IDocumentMetaData> ArticleDataTransformer(DataBindings? binding) => (x) => { HtmlTransformer<IDocumentMetaData> ArticleDataTransformer(DataBindings? binding) => (x) => {
@@ -75,38 +77,23 @@ namespace Beam.Temporary.Cli {
}; };
var novel = new DataKey<TextResource>("novels:house_of_horrors"); var novel = new DataKey<TextResource>("novels:house_of_horrors");
var context_aux = Architecture.GetMeta(web, novel, BeamData, cts.Token);
context_aux.RetryReporter = new Progress<RetryReport>((x) => Console.WriteLine($"Failed. Trying again. {x.TryNumber}"));
var metaDownloader = new DownloadEnumerable<IDocumentMetaData>(
new SequentialFragmentDownloader<IDocumentMetaData>(
context_aux,
(c) => new UnitFragmentDownloader<IDocumentMetaData>(c.Web, c.AsyncTranformer, c.AsyncFailurePredicates, 4, logger),
logger)
.UnwrapFragmented());
var metadata = (await metaDownloader.FirstAsync());
var metadata2 = await DownloadBuilder<IDocumentMetaData>.FromMeta(novel, BeamData) var metadata2 = await DownloadBuilder<IDocumentMetaData>.FromMeta(novel, BeamData)
.WithLink() .WithLink()
.WithTransformer(ArticleDataTransformer) .WithTransformer(ArticleDataTransformer)
.Configure((x) => x .Configure((x) => x
.WithDownloadLogger(logger)
.WithRetryReporter(new Progress<RetryReport>()) .WithRetryReporter(new Progress<RetryReport>())
.WithTimeOut(TimeSpan.FromSeconds(15))) .WithTimeOut(TimeSpan.FromSeconds(15)))
.Build() .Build()
.FirstAsync(); .FirstAsync();
var context = Architecture.GetTextRecord(web, novel, BeamData, metadata.Data, cts.Token);
context.DownloadReporter = new Progress<DownloadReport>((x) => Console.WriteLine(x));
var downloader = new DownloadEnumerable<IDocument>(
new SequentialFragmentDownloader<IDocument>(
context,
(c) => new UnitFragmentDownloader<IDocument>(c.Web, c.AsyncTranformer, c.AsyncFailurePredicates, 4, logger),
logger)
.UnwrapFragmented());
var downloader2 = DownloadBuilder<IDocument>.FromText(novel, BeamData) var downloader2 = DownloadBuilder<IDocument>.FromText(novel, BeamData)
.WithRange(1..5)
.WithLinkGenerator() .WithLinkGenerator()
.WithTransformer((x) => DocumentTransformer(x, metadata2.Data)) .WithTransformer((x) => DocumentTransformer(x, metadata2.Data))
.Configure((x) => x .Configure((x) => x
.WithDownloadLogger(logger)
.WithDownloadReporter(new Progress<DownloadReport>((x) => logger.LogInformation(x.ToString()))) .WithDownloadReporter(new Progress<DownloadReport>((x) => logger.LogInformation(x.ToString())))
.WithTimeOut(TimeSpan.FromSeconds(15)) .WithTimeOut(TimeSpan.FromSeconds(15))
) )
@@ -122,11 +109,16 @@ namespace Beam.Temporary.Cli {
continue; continue;
if (meta is not ArticleData articleMetaData) if (meta is not ArticleData articleMetaData)
continue; continue;
// Fetch the metadata stored under Architecture.BookKey; skip downloads that do not carry it.
if (!download.Data.MetaData.TryGetValue(Architecture.BookKey, out var bookmeta))
    continue;
// Match on the value just fetched. Testing `meta` here would bind the chapter metadata
// a second time, so the "Book title" line would repeat the chapter title.
if (bookmeta is not ArticleData bookMetaData)
    continue;
//Console.WriteLine($"Title: {data.Name}"); //Console.WriteLine($"Title: {data.Name}");
//Console.WriteLine($"Description: {data.Description}"); //Console.WriteLine($"Description: {data.Description}");
//Console.WriteLine($"Categories: {data.Categories.Aggregate((x, y) => $"{x}; {y}")}"); //Console.WriteLine($"Categories: {data.Categories.Aggregate((x, y) => $"{x}; {y}")}");
//Console.WriteLine($"Authors: {data.Authors.Aggregate((x,y) => $"{x}; {y}")}"); //Console.WriteLine($"Authors: {data.Authors.Aggregate((x,y) => $"{x}; {y}")}");
Console.WriteLine($"Chapter title: {articleMetaData.Name}"); Console.WriteLine($"Chapter title: {articleMetaData.Name}");
Console.WriteLine($"Book title: {bookMetaData.Name}");
//Console.WriteLine($"Content: {download}"); //Console.WriteLine($"Content: {download}");
//translationTasks.Add(Task.Run(async () => { //translationTasks.Add(Task.Run(async () => {
@@ -7,6 +7,7 @@ using System.Threading.Tasks;
namespace Beam.Temporary.Cli { namespace Beam.Temporary.Cli {
public class StateChangerFactory { public class StateChangerFactory {
[JsonIgnore]
public IStateChangeBehaviour Behavior => FactoryTable[StateChangerKey](); public IStateChangeBehaviour Behavior => FactoryTable[StateChangerKey]();
[JsonInclude] [JsonInclude]
+2 -2
View File
@@ -13,8 +13,8 @@ namespace Beam.Temporary.Cli {
public string? FriendlyName { get; set; } public string? FriendlyName { get; set; }
public DataKey<WebResource>? AssociatedSource { get; set; } public DataKey<WebResource>? AssociatedSource { get; set; }
public DataKey<WebResource>? AssociatedMetaSource { get; set; } public DataKey<WebResource>? AssociatedMetaSource { get; set; }
public required State TemplateInitialData { get; set; } public required ImmutableState TemplateInitialData { get; set; }
public State? MetaTemplateInitialData { get; set; } public ImmutableState? MetaTemplateInitialData { get; set; }
public TextResourceRecord ToRecord(BeamDataDictionary sdd) { public TextResourceRecord ToRecord(BeamDataDictionary sdd) {
return new(this, return new(this,
+34
View File
@@ -0,0 +1,34 @@
using Beam.Dynamic;
using Microsoft.Recognizers.Text;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Beam.Tests {
/// <summary>
/// Checks that <c>OnlineCleaner.ParseNumbers</c> turns written numbers into integers.
/// </summary>
public class OnlineCleanerTests {
    // Plain ASCII digits recognized with the English culture.
    [Fact]
    public void Should_LocalizeArabic() {
        var parsed = OnlineCleaner.ParseNumbers("1234", Culture.English);

        var only = Assert.Single(parsed);
        Assert.Equal(1234, only);
    }

    // Despite the method name, the input is written in Chinese numerals and uses the Chinese culture.
    [Fact]
    public void Should_LocalizeIndian() {
        var parsed = OnlineCleaner.ParseNumbers("九一五", Culture.Chinese);

        var only = Assert.Single(parsed);
        Assert.Equal(915, only);
    }
}
}
}
+6 -1
View File
@@ -1,7 +1,12 @@
using System.Globalization;

namespace Beam.Tests {
    /// <summary>
    /// Pins a framework limitation: the built-in integer parser does not read Chinese numerals.
    /// </summary>
    public class UnitTest1 {
        [Fact]
        public void Test1() {
            const string test = "九一五";

            // int.TryParse only accepts the ASCII digits 0-9, whatever NumberFormatInfo is supplied,
            // so asserting success here could never pass; the expected outcome is a failed parse.
            Assert.False(int.TryParse(test, CultureInfo.GetCultureInfo("zh-Hans").NumberFormat, out _));
        }
    }
}
+9
View File
@@ -13,6 +13,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Beam.Exports", "Beam.Export
EndProject EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Beam.Tests", "Beam.Tests\Beam.Tests.csproj", "{E26800C2-0518-49E8-88DF-A0B6ED97D4AB}" Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Beam.Tests", "Beam.Tests\Beam.Tests.csproj", "{E26800C2-0518-49E8-88DF-A0B6ED97D4AB}"
EndProject EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Beam.Puppeteer", "Beam.Puppeteer\Beam.Puppeteer.csproj", "{1A967563-D643-401D-A031-68DD43FACE8D}"
EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU Debug|Any CPU = Debug|Any CPU
@@ -39,8 +41,15 @@ Global
{E26800C2-0518-49E8-88DF-A0B6ED97D4AB}.Debug|Any CPU.Build.0 = Debug|Any CPU {E26800C2-0518-49E8-88DF-A0B6ED97D4AB}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E26800C2-0518-49E8-88DF-A0B6ED97D4AB}.Release|Any CPU.ActiveCfg = Release|Any CPU {E26800C2-0518-49E8-88DF-A0B6ED97D4AB}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E26800C2-0518-49E8-88DF-A0B6ED97D4AB}.Release|Any CPU.Build.0 = Release|Any CPU {E26800C2-0518-49E8-88DF-A0B6ED97D4AB}.Release|Any CPU.Build.0 = Release|Any CPU
{1A967563-D643-401D-A031-68DD43FACE8D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{1A967563-D643-401D-A031-68DD43FACE8D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{1A967563-D643-401D-A031-68DD43FACE8D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{1A967563-D643-401D-A031-68DD43FACE8D}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE
EndGlobalSection EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {363CAF17-9E48-45B9-AA3F-78BB5E95DB0E}
EndGlobalSection
EndGlobal EndGlobal
+11 -7
View File
@@ -4,22 +4,26 @@
<TargetFramework>net9.0</TargetFramework> <TargetFramework>net9.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings> <ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable> <Nullable>enable</Nullable>
<GeneratePackageOnBuild>True</GeneratePackageOnBuild>
<Title>Beam</Title>
<Authors>aeqw89</Authors>
<Company>qwsdcvghyu</Company>
<Description>A library for downloading internet resources</Description>
<PackageProjectUrl>https://github.com/qwsdcvghyu89/Beam</PackageProjectUrl>
<RepositoryUrl>https://github.com/qwsdcvghyu89/Beam</RepositoryUrl>
<PackageId>aeqw89.Beam</PackageId>
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>
<PackageReference Include="FluentBuilder" Version="0.10.0"> <PackageReference Include="aeqw89.DataKeys" Version="1.0.1" />
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="HtmlAgilityPack" Version="1.11.72" /> <PackageReference Include="HtmlAgilityPack" Version="1.11.72" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="9.0.1" /> <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="9.0.1" />
<PackageReference Include="System.Linq.Async" Version="6.0.1" /> <PackageReference Include="System.Linq.Async" Version="6.0.1" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Reference Include="aeqw89.DataKeys">
<HintPath>..\..\aeqw89.DataKeys\aeqw89.DataKeys\bin\Debug\net9.0\aeqw89.DataKeys.dll</HintPath>
</Reference>
</ItemGroup> </ItemGroup>
</Project> </Project>
+2 -15
View File
@@ -5,12 +5,13 @@ using System.Collections.Generic;
using System.Linq; using System.Linq;
using System.Text; using System.Text;
using System.Threading.Tasks; using System.Threading.Tasks;
using FluentBuilder;
using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging;
namespace Beam { namespace Beam {
public delegate T HtmlTransformer<out T>(HtmlDocument doc); public delegate T HtmlTransformer<out T>(HtmlDocument doc);
public delegate Task<U> AsyncTransformer<in T, U>(T elem);
public delegate Task<T> AsyncHtmlTransformer<T>(HtmlDocument doc); public delegate Task<T> AsyncHtmlTransformer<T>(HtmlDocument doc);
public delegate Task<T> AsyncBinaryTransformer<T>(byte[] bin);
public class DownloadContext<T> : IDisposable { public class DownloadContext<T> : IDisposable {
private bool disposedValue; private bool disposedValue;
@@ -19,8 +20,6 @@ namespace Beam {
=> DownloadContextBuilder<T>.FromContext(this); => DownloadContextBuilder<T>.FromContext(this);
public HtmlWeb Web { get; } public HtmlWeb Web { get; }
public HtmlTransformer<T> Transformer { get; }
public AsyncHtmlTransformer<T> AsyncTranformer { get; }
public IProgress<DownloadReport>? DownloadReporter { get; set; } public IProgress<DownloadReport>? DownloadReporter { get; set; }
public IProgress<RetryReport>? RetryReporter { get; set; } public IProgress<RetryReport>? RetryReporter { get; set; }
public AsyncDownloadFailurePredicate<HtmlDocument>?[]? AsyncFailurePredicates { get; } public AsyncDownloadFailurePredicate<HtmlDocument>?[]? AsyncFailurePredicates { get; }
@@ -33,8 +32,6 @@ namespace Beam {
public DownloadContext(HtmlWeb web, public DownloadContext(HtmlWeb web,
IEnumerable<SourceLink> links, IEnumerable<SourceLink> links,
CancellationToken cancellationToken = default, CancellationToken cancellationToken = default,
HtmlTransformer<T>? transformer = null,
AsyncHtmlTransformer<T>? asyncTransformer = null,
IProgress<DownloadReport>? downloadReporter = null, IProgress<DownloadReport>? downloadReporter = null,
IProgress<RetryReport>? retryReporter = null, IProgress<RetryReport>? retryReporter = null,
AsyncDownloadFailurePredicate<HtmlDocument>?[]? asyncFailurePredicates = null, AsyncDownloadFailurePredicate<HtmlDocument>?[]? asyncFailurePredicates = null,
@@ -46,16 +43,6 @@ namespace Beam {
Web = web; Web = web;
Links = links; Links = links;
CancellationToken = cancellationToken; CancellationToken = cancellationToken;
if (transformer is null && asyncTransformer is null)
throw new ArgumentException($"Either {nameof(transformer)} or {nameof(asyncTransformer)} must be not null.");
Transformer = transformer!;
AsyncTranformer = asyncTransformer!;
if (transformer is null && asyncTransformer is not null)
Transformer = (x) => asyncTransformer(x).Result;
if (asyncTransformer is null && transformer is not null)
AsyncTranformer = (x) => Task.FromResult(transformer(x));
DownloadReporter = downloadReporter; DownloadReporter = downloadReporter;
RetryReporter = retryReporter; RetryReporter = retryReporter;
AsyncFailurePredicates = asyncFailurePredicates; AsyncFailurePredicates = asyncFailurePredicates;
-16
View File
@@ -10,8 +10,6 @@ namespace Beam {
public class DownloadContextBuilder<T> { public class DownloadContextBuilder<T> {
private HtmlWeb _web; private HtmlWeb _web;
private HtmlTransformer<T> _transformer;
private AsyncHtmlTransformer<T> _asyncTransformer;
private IProgress<DownloadReport>? _downloadReporter; private IProgress<DownloadReport>? _downloadReporter;
private IProgress<RetryReport>? _retryReporter; private IProgress<RetryReport>? _retryReporter;
private AsyncDownloadFailurePredicate<HtmlDocument>?[]? _asyncFailurePredicates; private AsyncDownloadFailurePredicate<HtmlDocument>?[]? _asyncFailurePredicates;
@@ -34,16 +32,6 @@ namespace Beam {
return this; return this;
} }
public DownloadContextBuilder<T> WithTransformer(HtmlTransformer<T> transformer) {
_transformer = transformer;
return this;
}
public DownloadContextBuilder<T> WithAsyncTransformer(AsyncHtmlTransformer<T> asyncTransformer) {
_asyncTransformer = asyncTransformer;
return this;
}
public DownloadContextBuilder<T> WithDownloadReporter(IProgress<DownloadReport> downloadReporter) { public DownloadContextBuilder<T> WithDownloadReporter(IProgress<DownloadReport> downloadReporter) {
_downloadReporter = downloadReporter; _downloadReporter = downloadReporter;
return this; return this;
@@ -91,8 +79,6 @@ namespace Beam {
web: _web, web: _web,
links: _links, links: _links,
cancellationToken: _cancellationToken, cancellationToken: _cancellationToken,
transformer: _transformer,
asyncTransformer: _asyncTransformer,
downloadReporter: _downloadReporter, downloadReporter: _downloadReporter,
retryReporter: _retryReporter, retryReporter: _retryReporter,
asyncFailurePredicates: _asyncFailurePredicates, asyncFailurePredicates: _asyncFailurePredicates,
@@ -115,8 +101,6 @@ namespace Beam {
return new DownloadContextBuilder<T>(existing.Web) return new DownloadContextBuilder<T>(existing.Web)
.WithLinks(existing.Links) .WithLinks(existing.Links)
.WithCancellationToken(existing.CancellationToken) .WithCancellationToken(existing.CancellationToken)
.WithTransformer(existing.Transformer)
.WithAsyncTransformer(existing.AsyncTranformer)
.WithDownloadReporter(existing.DownloadReporter!) .WithDownloadReporter(existing.DownloadReporter!)
.WithRetryReporter(existing.RetryReporter!) .WithRetryReporter(existing.RetryReporter!)
.WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty<AsyncDownloadFailurePredicate<HtmlDocument>>()) .WithAsyncFailurePredicates(existing.AsyncFailurePredicates ?? Array.Empty<AsyncDownloadFailurePredicate<HtmlDocument>>())
+23
View File
@@ -0,0 +1,23 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Beam {
/// <summary>
/// A read-only snapshot of a state array.
/// The array handed to the constructor is copied, and every accessor that exposes
/// more than a single element hands out a fresh copy, so neither the original
/// array's owner nor a consumer of <see cref="GetState"/> can alter the snapshot.
/// </summary>
/// <param name="state">Values to snapshot; copied (shallowly) on construction.</param>
/// <exception cref="ArgumentNullException"><paramref name="state"/> is <c>null</c>.</exception>
public readonly struct ImmutableState(object[] state) {
    // Private shallow copy: later writes to the caller's array must not leak into this snapshot.
    readonly object[] state = (object[])(state ?? throw new ArgumentNullException(nameof(state))).Clone();

    /// <summary>
    /// Returns the stored values as a span over a fresh copy.
    /// The span is writable (the return type is kept for existing callers), but writes
    /// only touch the copy and never the snapshot held by this instance.
    /// </summary>
    public readonly Span<object> GetState() => Snapshot();

    /// <summary>Creates a mutable <see cref="State"/> initialised with a copy of the stored values.</summary>
    public readonly State Copy()
        => new(Snapshot());

    /// <summary>Gets the element at position <paramref name="i"/> of the snapshot.</summary>
    public readonly object this[Index i] {
        get => state[i];
    }

    /// <summary>Converts to a mutable <see cref="State"/> by copying, equivalent to <see cref="Copy"/>.</summary>
    public static implicit operator State(ImmutableState state)
        => state.Copy();

    // The field is null for default(ImmutableState), which bypasses the constructor;
    // treat that case as an empty snapshot instead of failing inside Clone().
    private object[] Snapshot()
        => state is null ? Array.Empty<object>() : (object[])state.Clone();
}
}
+12 -3
View File
@@ -13,15 +13,17 @@ namespace Beam {
public class OrderedSourceLinkGenerator : IEnumerator<SourceLink> { public class OrderedSourceLinkGenerator : IEnumerator<SourceLink> {
public SourceLinkBuilder Builder { get; set; } public SourceLinkBuilder Builder { get; set; }
public NumberedStateChanger Behaviour { get; } public NumberedStateChanger Behaviour { get; }
private State InitialState;
private State? EndState;
private State InitialState;
public OrderedSourceLinkGenerator(SourceLinkBuilder builder, NumberedStateChanger behaviour, params object[] initialState) public OrderedSourceLinkGenerator(SourceLinkBuilder builder, NumberedStateChanger behaviour, params object[] initialState)
: this(builder, behaviour, new State(initialState)) { } : this(builder, behaviour, new State(initialState)) { }
public OrderedSourceLinkGenerator(SourceLinkBuilder builder, NumberedStateChanger behaviour, State initialState) { public OrderedSourceLinkGenerator(SourceLinkBuilder builder, NumberedStateChanger behaviour, State initialState, State? endState = null) {
Builder = builder; Builder = builder;
Behaviour = behaviour; Behaviour = behaviour;
InitialState = initialState; InitialState = initialState.Copy();
EndState = endState?.Copy();
State = InitialState.Copy(); State = InitialState.Copy();
Reset(); Reset();
@@ -37,8 +39,15 @@ namespace Beam {
} }
public bool MoveNext() { public bool MoveNext() {
if (!Current.HasValue || (EndState is not null && State.GetState().SequenceEqual(EndState.GetState()))) {
Current = SourceLink.InvalidLink;
return false;
}
Behaviour.Apply(State, 1); Behaviour.Apply(State, 1);
Current = Builder.Build(State); Current = Builder.Build(State);
if (!Current.HasValue || (EndState is not null && State.GetState().SequenceEqual(EndState.GetState()))) {
return false;
}
return Current.HasValue; return Current.HasValue;
} }
+4 -1
View File
@@ -35,13 +35,16 @@ namespace Beam {
links.Add(new Ordered<string>(LinksEnumerator.Current.Link.ToString(), LastOrder++)); links.Add(new Ordered<string>(LinksEnumerator.Current.Link.ToString(), LastOrder++));
while (LinksEnumerator.MoveNext() && links.Count < idealLinkCount) while (LinksEnumerator.MoveNext() && LinksEnumerator.Current != SourceLink.InvalidLink && links.Count < idealLinkCount)
links.Add(new Ordered<string>(LinksEnumerator.Current.Link.ToString(), LastOrder++)); links.Add(new Ordered<string>(LinksEnumerator.Current.Link.ToString(), LastOrder++));
if (links.Count == 0) { if (links.Count == 0) {
Logger?.LogInformation("Out of links!"); Logger?.LogInformation("Out of links!");
return false; return false;
} }
if (links.Any((x) => x.Data == SourceLink.InvalidLink.Link.ToString()))
return false;
var (result, downloadedT) = await unit.TryDownload( var (result, downloadedT) = await unit.TryDownload(
links.ToArray(), links.ToArray(),
Context.CancellationToken, Context.CancellationToken,
+73
View File
@@ -0,0 +1,73 @@
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
namespace Beam {
/// <summary>
/// A download-managing class that retrieves binary data through <see cref="HttpClient"/>,
/// applies an <see cref="AsyncBinaryTransformer{T}"/>, and supports failure detection
/// plus exponential-back-off retries. The instance only stores the dependencies passed
/// to its constructor.
/// </summary>
public class UnitDownloaderBinary<T>(
    HttpClient client,
    AsyncBinaryTransformer<T> transformer,
    AsyncDownloadFailurePredicate<HttpResponseMessage>?[]? failurePredicates = null)
    : IUnitDownloader<T> {

    // Largest pause handed to Task.Delay; keeps 2^attempt seconds from overflowing
    // once the attempt counter grows past 21.
    private const double MaxBackoffMilliseconds = int.MaxValue;

    /// <summary>HTTP client used for every request issued by this downloader.</summary>
    public HttpClient Client { get; } = client;

    /// <summary>Converts the downloaded response body (raw bytes) into <typeparamref name="T"/>.</summary>
    public virtual AsyncBinaryTransformer<T> Transformer { get; } = transformer;

    /// <summary>
    /// Optional checks run against a successful HTTP response; if any of them returns
    /// <c>true</c> the attempt is treated as failed. <c>null</c> entries are skipped.
    /// </summary>
    public virtual AsyncDownloadFailurePredicate<HttpResponseMessage>?[]? FailurePredicates { get; } = failurePredicates;

    /// <summary>Number of links consumed per download; this downloader only uses the first link.</summary>
    public int LinksPerDownload { get; } = 1;

    /// <summary>Runs all configured failure predicates in parallel on the raw HTTP response.</summary>
    /// <param name="response">Response whose status code already indicated success.</param>
    /// <returns><c>true</c> if at least one predicate flagged the response as a failure.</returns>
    protected virtual async Task<bool> IsFailure(HttpResponseMessage response) {
        if (FailurePredicates is null) return false;

        // Shared flag written by parallel workers. It only ever flips from false to true,
        // so a stale read merely lets one more predicate run.
        var failed = false;
        await Parallel.ForEachAsync(FailurePredicates, async (pred, _) => {
            if (failed || pred is null) return;
            if (await pred(response)) failed = true;
        });

        return failed;
    }

    /// <summary>One attempt without retries or back-off.</summary>
    /// <param name="link">Address to request.</param>
    /// <param name="ct">Token that cancels the request and the body read.</param>
    /// <returns>
    /// <c>(true, value)</c> when the request succeeded, no failure predicate fired and the
    /// transformer produced a value; <c>(false, default)</c> on any other outcome.
    /// </returns>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    protected virtual async Task<(bool Success, T? Result)> TryDownloadWithNoRetries(string link, CancellationToken ct) {
        try {
            // Headers first, so status and predicates can be checked before the body is buffered.
            using var response = await Client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, ct);
            if (!response.IsSuccessStatusCode) return (false, default);
            if (await IsFailure(response)) return (false, default);

            var bytes = await response.Content.ReadAsByteArrayAsync(ct);
            return (true, await Transformer(bytes));
        } catch (OperationCanceledException) when (ct.IsCancellationRequested) {
            // Caller-requested cancellation is not a download failure; let it propagate
            // instead of being reported as a retry. Timeouts not caused by ct fall through below.
            throw;
        } catch {
            // Best effort: any other error counts as a failed attempt and is retried by the caller.
            return (false, default);
        }
    }

    /// <summary>
    /// Downloads the first link of <paramref name="link"/>, retrying with exponential
    /// back-off (2^attempt seconds) between failed attempts.
    /// </summary>
    /// <param name="link">Links to download; only the first element is used.</param>
    /// <param name="ct">Token that cancels the attempts and the pauses between them.</param>
    /// <param name="maximumRetryCount">Maximum number of attempts.</param>
    /// <param name="tryProgress">Optional sink notified after every failed attempt.</param>
    /// <returns>
    /// <c>(true, value)</c> on success; <c>(false, lastResult)</c> when <paramref name="link"/>
    /// is empty or every attempt failed.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="link"/> is <c>null</c>.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    public async Task<(bool, T?)> TryDownload(
        Ordered<string>[] link,
        CancellationToken ct,
        int maximumRetryCount = 7,
        IProgress<RetryReport>? tryProgress = null) {
        ArgumentNullException.ThrowIfNull(link);
        if (link.Length == 0) return (false, default);

        var url = link[0].Data;
        T? result = default;
        var attempt = 0;

        while (attempt < maximumRetryCount) {
            ct.ThrowIfCancellationRequested();

            (var success, result) = await TryDownloadWithNoRetries(url, ct);
            if (success && result is not null) return (true, result);

            ++attempt;
            tryProgress?.Report(new RetryReport(attempt, url));

            // No attempt follows the last failure, so do not make the caller wait for it.
            if (attempt >= maximumRetryCount) break;

            await Task.Delay(BackoffDelay(attempt), ct);
        }

        return (false, result);
    }

    // Pause before the next attempt: 2^attempt seconds, computed in floating point and capped
    // so that large attempt numbers cannot overflow into a negative delay.
    private static TimeSpan BackoffDelay(int attempt)
        => TimeSpan.FromMilliseconds(Math.Min(Math.Pow(2, attempt) * 1000, MaxBackoffMilliseconds));
}
}
+4 -3
View File
@@ -13,11 +13,12 @@ namespace Beam {
AsyncHtmlTransformer<T> transformer, AsyncHtmlTransformer<T> transformer,
AsyncDownloadFailurePredicate<HtmlDocument>?[]? failurePredicate = null, AsyncDownloadFailurePredicate<HtmlDocument>?[]? failurePredicate = null,
int fragmentSize = 4, int fragmentSize = 4,
ILogger? logger = null) { ILogger? logger = null,
IUnitDownloader<T>? internalDownloader = null) {
Web = web; Web = web;
Transformer = transformer; Transformer = transformer;
FailurePredicate = failurePredicate; FailurePredicate = failurePredicate;
UnitDownloader = new UnitDownloader<T>(Web, Transformer, FailurePredicate); UnitDownloader = internalDownloader ?? new UnitDownloader<T>(Web, Transformer, FailurePredicate);
LinksPerDownload = fragmentSize; LinksPerDownload = fragmentSize;
Logger = logger; Logger = logger;
} }
@@ -28,7 +29,7 @@ namespace Beam {
public int LinksPerDownload { get; set; } public int LinksPerDownload { get; set; }
public ILogger? Logger { get; set; } public ILogger? Logger { get; set; }
private readonly UnitDownloader<T> UnitDownloader; private readonly IUnitDownloader<T> UnitDownloader;
async Task<(bool, Fragment<Ordered<T>>?)> IUnitDownloader<Fragment<Ordered<T>>>.TryDownload(Ordered<string>[] link, CancellationToken ct, int maximumRetryCount, IProgress<RetryReport>? tryProgress) { async Task<(bool, Fragment<Ordered<T>>?)> IUnitDownloader<Fragment<Ordered<T>>>.TryDownload(Ordered<string>[] link, CancellationToken ct, int maximumRetryCount, IProgress<RetryReport>? tryProgress) {
Fragment<Ordered<T>> fragment = new Fragment<Ordered<T>>(link.Length); Fragment<Ordered<T>> fragment = new Fragment<Ordered<T>>(link.Length);