Added constant state changers to represent singular/repeating states. Added a DownloadContextBuilder to support fluent building patterns. Changed RetryReporter and DownloadReporter to use RetryReport and DownloadReport structs to simplify type declarations. Made MainArchitecture obsolete by supporting fluent downloads with DownloadBuilder. Created a 'budge' OpenAI bridge for proof-of-concept translation.

This commit is contained in:
qwsdcvghyu89
2025-06-07 00:56:26 +03:00
parent a086cfa02b
commit a9a22ea23d
28 changed files with 809 additions and 145 deletions
+5 -5
View File
@@ -13,26 +13,26 @@ namespace Beam.Temporary.Cli {
/// A collection of specific useful methods and constants that facilitate the use of the application; allows other parts of the application to depend on architecture-specific arbitrary choices without compromising the Single-Responsibility principle or increasing redundant code.
/// </para>
/// </summary>
partial interface IArchitecture {
public partial interface IArchitecture {
/// <summary>
/// Gets the metadata associated with a <see cref="TextResource"/>
/// </summary>
/// <param name="web">The web client to use when downloading <see cref="WebResource"/>s</param>
/// <param name="pieceKey">The key of the <see cref="TextResource"/> stored in the <paramref name="sdd"/></param>
/// <param name="sdd">The <see cref="SharedDataDictionary"/> to be used to retrieve information</param>
/// <param name="sdd">The <see cref="BeamDataDictionary"/> to be used to retrieve information</param>
/// <param name="logger">Optional logger for logging debug information</param>
/// <returns>A <see cref="DownloadContext{T}"/> object with the required information to perform the download</returns>
public DownloadContext<IDocumentMetaData>? GetMeta(HtmlWeb web, DataKey<TextResource> pieceKey, SharedDataDictionary sdd, ILogger? logger = null);
public DownloadContext<IDocumentMetaData>? GetMeta(HtmlWeb web, DataKey<TextResource> pieceKey, BeamDataDictionary sdd, CancellationToken ct = default, ILogger? logger = null);
/// <summary>
/// Gets the <see cref="DownloadContext{T}"/> of the text record associated with <see cref="TextResource"/>
/// </summary>
/// <param name="web">The web client to use when downloading <see cref="WebResource"/>s</param>
/// <param name="pieceKey">The key of the <see cref="TextResource"/> stored in the <paramref name="sdd"/></param>
/// <param name="sdd">The <see cref="SharedDataDictionary"/> to be used to retrieve information</param>
/// <param name="sdd">The <see cref="BeamDataDictionary"/> to be used to retrieve information</param>
/// <param name="metadata">Optional book metadata to include with the final text record</param>
/// <param name="logger">Optional logger for logging debug information</param>
/// <returns>A <see cref="DownloadContext{T}"/> object with the required information to perform the download</returns>
public DownloadContext<IDocument>? GetTextRecord(HtmlWeb web, DataKey<TextResource> pieceKey, SharedDataDictionary sdd, IDocumentMetaData? metadata = null, ILogger? logger = null);
public DownloadContext<IDocument>? GetTextRecord(HtmlWeb web, DataKey<TextResource> pieceKey, BeamDataDictionary sdd, IDocumentMetaData? metadata = null, CancellationToken ct = default, ILogger? logger = null);
/// <summary>
/// The <see cref="DataKey{IDocumentMetaData}"/> to use when looking for the chapter metadata
@@ -11,6 +11,7 @@
<PackageReference Include="Microsoft.Extensions.Logging" Version="9.0.1" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="9.0.1" />
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="9.0.1" />
<PackageReference Include="OpenAI" Version="2.1.0" />
<PackageReference Include="Spectre.Console" Version="0.49.2-preview.0.70" />
<PackageReference Include="System.Linq.Async" Version="6.0.1" />
</ItemGroup>
@@ -9,9 +9,9 @@ using System.Text.Json;
using System.Threading.Tasks;
namespace Beam.Temporary.Cli {
public class SharedDataDictionary : BaseDataDictionary {
public Dictionary<DataKey<WebResource>, PackagedSourceLinkGenerationData> Templates {
get => GetOrCreateDictionary<DataKey<WebResource>, PackagedSourceLinkGenerationData>(nameof(Templates));
public class BeamDataDictionary : BaseDataDictionary {
public Dictionary<DataKey<WebResource>, Template> Templates {
get => GetOrCreateDictionary<DataKey<WebResource>, Template>(nameof(Templates));
set => Data[nameof(Templates)] = value;
}
+34
View File
@@ -0,0 +1,34 @@
using aeqw89.DataKeys;
using Beam.Dynamic;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Beam.Temporary.Cli {
/// <summary>
/// Factory methods for the <see cref="HtmlTransformer{T}"/> delegates shared by the download pipelines.
/// </summary>
public static class CommonTransformers {
    /// <summary>
    /// Creates a transformer that builds an <see cref="ArticleData"/> from the author, title, tag and
    /// description bindings of <paramref name="binding"/>.
    /// </summary>
    /// <param name="binding">
    /// The bindings to resolve against the page. When it is <see langword="null"/>, or an individual
    /// binding is missing, the corresponding field falls back to an empty string.
    /// </param>
    /// <returns>A transformer that yields the resolved metadata.</returns>
    public static HtmlTransformer<IDocumentMetaData> ArticleDataTransformer(DataBindings? binding) => (x) => {
        return new ArticleData() {
            Authors = [OnlineCleaner.Clean(binding?.Authors?.Resolve(x) ?? "")],
            Name = OnlineCleaner.Clean(binding?.Title?.ResolveString(x) ?? ""),
            // string.Split never returns null, and "".Split(';') yields a single empty entry, so blank
            // entries are dropped here to make a page without tags produce an empty category list.
            Categories = OnlineCleaner.Clean(binding?.Tags?.ResolveString(x) ?? "")
                .Split(';', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries),
            Description = OnlineCleaner.Clean(binding?.Description?.ResolveString(x) ?? "")
        };
    };

    /// <summary>
    /// Creates a transformer that turns a page into a <see cref="StringDocument"/> carrying the cleaned
    /// content and a metadata dictionary.
    /// </summary>
    /// <param name="binding">The bindings used to resolve the title and content of the page.</param>
    /// <param name="metaData">
    /// Optional metadata stored under <c>IArchitecture.Default.BookKey</c>; omitted from the dictionary
    /// when <see langword="null"/>.
    /// </param>
    /// <returns>A transformer that yields the document; its file name is a random file name.</returns>
    public static HtmlTransformer<IDocument> DocumentTransformer(DataBindings? binding, IDocumentMetaData? metaData = null) => (x) => {
        var resolved = binding?.Resolve(x);

        // The resolved title is stored as the chapter-level metadata entry.
        var articleData = new ArticleData() {
            Name = OnlineCleaner.Clean(resolved?.Title),
        };

        Dictionary<DataKey<IDocumentMetaData>, IDocumentMetaData> meta = [];
        meta.Add(IArchitecture.Default.ChapterKey, articleData);
        if (metaData is not null)
            meta.Add(IArchitecture.Default.BookKey, metaData);

        return new StringDocument(Path.GetRandomFileName(), OnlineCleaner.Clean(resolved?.Content)) {
            MetaData = meta
        };
    };
}
}
+150
View File
@@ -0,0 +1,150 @@
using aeqw89.DataKeys;
using Beam.Dynamic;
using Microsoft.Extensions.Logging;
using System;
using System.Collections.Generic;
namespace Beam.Temporary.Cli {
/// <summary>
/// Type-safe, staged builder that prevents callers from forgetting the mandatory steps
/// (source → link selection → transformer) and surfaces operational knobs as first-class
/// methods instead of magic parameters.
/// </summary>
/// <typeparam name="T">The type produced by the transformer for every downloaded page.</typeparam>
public static class DownloadBuilder<T> {
    /* ──────────────────────────── Entry points ─────────────────────────── */

    /// <summary>
    /// Starts a build that downloads from the meta source associated with <paramref name="novelKey"/>.
    /// </summary>
    /// <param name="novelKey">Key of the novel inside <c>data.Novels</c>.</param>
    /// <param name="data">Dictionary that holds the novels, templates and bindings.</param>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is <see langword="null"/>.</exception>
    /// <exception cref="KeyNotFoundException">The novel is not registered.</exception>
    /// <exception cref="InvalidOperationException">The meta source or its initial template data is missing.</exception>
    public static ILinkStage FromMeta(DataKey<TextResource> novelKey, BeamDataDictionary data) =>
        Create(novelKey, data, SourceKind.Meta);

    /// <summary>
    /// Starts a build that downloads from the text source associated with <paramref name="novelKey"/>.
    /// </summary>
    /// <param name="novelKey">Key of the novel inside <c>data.Novels</c>.</param>
    /// <param name="data">Dictionary that holds the novels, templates and bindings.</param>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is <see langword="null"/>.</exception>
    /// <exception cref="KeyNotFoundException">The novel is not registered.</exception>
    /// <exception cref="InvalidOperationException">The text source or its initial template data is missing.</exception>
    public static ILinkStage FromText(DataKey<TextResource> novelKey, BeamDataDictionary data) =>
        Create(novelKey, data, SourceKind.Text);

    /* ────────────────────────────── Stages ─────────────────────────────── */

    /// <summary>First stage: decides how the links to download are produced.</summary>
    public interface ILinkStage {
        /// <summary>Uses the single link built from the source template and the initial state.</summary>
        ITransformStage WithLink();

        /// <summary>Uses a generated sequence of links that starts at the initial state.</summary>
        ITransformStage WithLinkGenerator();
    }

    /// <summary>Second stage: supplies the transformer applied to every downloaded page.</summary>
    public interface ITransformStage {
        /// <summary>Creates the transformer from the bindings registered for the source.</summary>
        /// <param name="factory">Receives the source bindings and returns the transformer to use.</param>
        IContextStage WithTransformer(Func<DataBindings, HtmlTransformer<T>> factory);
    }

    /// <summary>Final stage: optional tuning followed by <see cref="Build"/>.</summary>
    public interface IContextStage {
        /// <summary>Gives direct access to the underlying context builder.</summary>
        IContextStage Configure(Action<DownloadContextBuilder<T>> configure);

        /// <summary>Sets the degree of parallelism; values below 1 are raised to 1.</summary>
        IContextStage WithParallelism(int degree);

        /// <summary>Forwards the time-out to the context builder.</summary>
        IContextStage WithTimeout(TimeSpan timeout);

        /// <summary>Forwards the retry reporter to the context builder.</summary>
        IContextStage WithRetryReporter(IProgress<RetryReport> reporter);

        /// <summary>Builds the context and wraps the resulting downloader in a <see cref="DownloadEnumerable{T}"/>.</summary>
        DownloadEnumerable<T> Build();
    }

    /* ────────────────────────── Implementation ────────────────────────── */

    private enum SourceKind { Meta, Text }

    // Resolves the source for the requested kind and seeds the context builder with an empty link list.
    private static ILinkStage Create(DataKey<TextResource> novelKey, BeamDataDictionary data, SourceKind kind) {
        ArgumentNullException.ThrowIfNull(data);

        var (source, initial) = Resolve(novelKey, data, kind);
        var ctxBuilder = new DownloadContextBuilder<T>().WithLinks(Array.Empty<SourceLink>()); // placeholder, filled later.
        return new LinkStage(source, initial, data, ctxBuilder);
    }

    // Looks up the novel and returns the web resource plus the initial template state for the requested kind.
    private static (WebResource Source, State Initial) Resolve(DataKey<TextResource> novelKey, BeamDataDictionary data, SourceKind kind) {
        if (!data.Novels.TryGetValue(novelKey, out var tr))
            throw new KeyNotFoundException($"Novel '{novelKey}' not found in BeamDataDictionary.");

        var textRecord = tr.ToRecord(data);

        WebResource? source;
        State? initial;

        if (kind == SourceKind.Meta) {
            source = textRecord.AssociatedMetaSource ?? throw new InvalidOperationException($"Meta source missing for '{novelKey}'.");
            initial = textRecord.Resource.MetaTemplateInitialData ?? throw new InvalidOperationException("Meta template data missing.");
        } else {
            source = textRecord.AssociatedSource ?? throw new InvalidOperationException($"Text source missing for '{novelKey}'.");
            initial = textRecord.Resource.TemplateInitialData ?? throw new InvalidOperationException("Template initial data missing.");
        }

        return (source, initial);
    }

    /* ──────────────────────────── Stage types ─────────────────────────── */

    private sealed record LinkStage(
        WebResource Source,
        State Initial,
        BeamDataDictionary Data,
        DownloadContextBuilder<T> CtxBuilder) : ILinkStage {

        public ITransformStage WithLink() {
            // Report which source lacks a template instead of surfacing the bare dictionary error.
            if (!Data.Templates.TryGetValue(Source.Key, out var template))
                throw new KeyNotFoundException($"No link template registered for source '{Source.Key}'.");

            var link = template.Builder.Build(Initial);
            CtxBuilder.WithLinks(new[] { link });
            return new TransformStage(Source, Data, CtxBuilder);
        }

        public ITransformStage WithLinkGenerator() {
            if (!Data.Templates.TryGetValue(Source.Key, out var template))
                throw new KeyNotFoundException($"No link template registered for source '{Source.Key}'.");

            var generator = SourceLinkEnumerable.FromGenerator(new OrderedSourceLinkGenerator(
                template.Builder,
                new NumberedStateChanger(template.Factory.Behavior),
                Initial));
            CtxBuilder.WithLinks(generator);
            return new TransformStage(Source, Data, CtxBuilder);
        }
    }

    private sealed record TransformStage(
        WebResource Source,
        BeamDataDictionary Data,
        DownloadContextBuilder<T> CtxBuilder) : ITransformStage {

        public IContextStage WithTransformer(Func<DataBindings, HtmlTransformer<T>> factory) {
            ArgumentNullException.ThrowIfNull(factory);

            if (!Data.Bindings.TryGetValue(Source.Bindings, out var bindings))
                throw new KeyNotFoundException($"No data bindings registered under '{Source.Bindings}' for source '{Source.Key}'.");

            var transformer = factory(bindings);
            CtxBuilder.WithTransformer(transformer);
            return new ContextStage(CtxBuilder);
        }
    }

    private sealed class ContextStage : IContextStage {
        private readonly DownloadContextBuilder<T> _ctxBuilder;
        private int _parallelism = 4;

        public ContextStage(DownloadContextBuilder<T> ctxBuilder) => _ctxBuilder = ctxBuilder;

        public IContextStage Configure(Action<DownloadContextBuilder<T>> configure) {
            ArgumentNullException.ThrowIfNull(configure);

            configure(_ctxBuilder);
            return this;
        }

        public IContextStage WithParallelism(int degree) {
            // Non-positive degrees are clamped rather than rejected.
            _parallelism = Math.Max(1, degree);
            return this;
        }

        public IContextStage WithTimeout(TimeSpan timeout) {
            _ctxBuilder.WithTimeOut(timeout);
            return this;
        }

        public IContextStage WithRetryReporter(IProgress<RetryReport> reporter) {
            _ctxBuilder.WithRetryReporter(reporter);
            return this;
        }

        public DownloadEnumerable<T> Build() {
            var context = _ctxBuilder.Build();

            // The factory argument is not needed: every unit downloader is configured from the built context.
            SequentialFragmentDownloader<T> sequentialDownloader = new(
                context,
                _ => new UnitFragmentDownloader<T>(
                    context.Web,
                    context.AsyncTranformer,
                    context.AsyncFailurePredicates,
                    _parallelism,
                    context.DownloadLogger),
                context.DownloadLogger);

            var enumerable = new DownloadEnumerable<T>(
                sequentialDownloader
                    .UnwrapFragmented());

            // NOTE(review): the downloader is disposed here, blocking on the async dispose, before the
            // returned enumerable has been enumerated. Confirm the unwrapped sequence does not depend on
            // the disposed downloader, and consider handing ownership to the enumerable instead.
            sequentialDownloader.DisposeAsync().AsTask().Wait();

            return enumerable;
        }
    }
}
}
+19 -9
View File
@@ -4,14 +4,23 @@ using HtmlAgilityPack;
using Microsoft.Extensions.Logging;
namespace Beam.Temporary.Cli {
partial interface IArchitecture {
public partial interface IArchitecture {
private class MainArchitecture : IArchitecture {
public MainArchitecture() { }
public DataKey<IDocumentMetaData> ChapterKey { get; set; } = new("ma:chapter");
public DataKey<IDocumentMetaData> BookKey { get; set; } = new("ma:book");
public DownloadContext<IDocumentMetaData>? GetMeta(HtmlWeb web, DataKey<TextResource> pieceKey, SharedDataDictionary sdd, ILogger? logger = null) {
/// <summary>
/// Creates the transformer that resolves article metadata from <paramref name="binding"/>.
/// Delegates to <see cref="CommonTransformers.ArticleDataTransformer"/> so the logic is defined once.
/// </summary>
/// <param name="binding">The bindings to resolve against the page; may be <see langword="null"/>.</param>
/// <returns>A transformer that yields the resolved metadata.</returns>
public HtmlTransformer<IDocumentMetaData> ArticleDataTransformer(DataBindings? binding) =>
    CommonTransformers.ArticleDataTransformer(binding);
public DownloadContext<IDocumentMetaData>? GetMeta(HtmlWeb web, DataKey<TextResource> pieceKey, BeamDataDictionary sdd, CancellationToken ct = default, ILogger? logger = null) {
var piece = sdd.Novels[pieceKey].ToRecord(sdd); // retrieves novel data from the sdd
var auxiliary = piece.AssociatedMetaSource?.ToRecord(sdd); // retrieves novel aux data from the sdd
@@ -22,7 +31,7 @@ namespace Beam.Temporary.Cli {
return null;
// gets the link for the novel's metadata using the auxillary data retrieved from the sdd
var link = sdd.Templates[auxiliary.Resource.Key].GenerateLink(piece?.Resource?.MetaTemplateInitialData!);
var link = sdd.Templates[auxiliary.Resource.Key].Builder.Build(piece?.Resource?.MetaTemplateInitialData);
var binding = auxiliary.Bindings;
return new DownloadContext<IDocumentMetaData>(web, [link], downloadLogger: logger, transformer: (x) => {
@@ -35,7 +44,7 @@ namespace Beam.Temporary.Cli {
});
}
public DownloadContext<IDocument>? GetTextRecord(HtmlWeb web, DataKey<TextResource> resKey, SharedDataDictionary sdd, IDocumentMetaData? metaData = null, ILogger? logger = null) {
public DownloadContext<IDocument>? GetTextRecord(HtmlWeb web, DataKey<TextResource> resKey, BeamDataDictionary sdd, IDocumentMetaData? metaData = null, CancellationToken ct = default, ILogger? logger = null) {
var res = sdd.Novels[resKey].ToRecord(sdd); // retrieves the novel data from the sdd
var aggregator = res.AssociatedSource?.ToRecord(sdd); // retrieves the aggregator (novel web source) from the sdd
@@ -47,8 +56,9 @@ namespace Beam.Temporary.Cli {
var template = sdd.Templates[aggregator.Resource.Key]; // gets the link generator for the specified aggregator
// creates a generative enumerable of type link from 'template'
var sle = SourceLinkEnumerable.FromGenerator(new DataBackedSourceLinkGenerator(
template, res.Resource.TemplateInitialData));
var sle = SourceLinkEnumerable.FromGenerator(new OrderedSourceLinkGenerator(
template.Builder, new NumberedStateChanger(template.Factory.Behavior),
res.Resource.TemplateInitialData));
return new DownloadContext<IDocument>(web, sle,
transformer: (x) => {
@@ -64,10 +74,10 @@ namespace Beam.Temporary.Cli {
MetaData = meta
};
},
retryReporter: new Progress<int>((x) => Console.WriteLine($"Retrying download ({x})")),
downloadReporter: new Progress<IDocument>((x) => Console.WriteLine($"Downloaded ({x.Filename})")),
retryReporter: new Progress<RetryReport>((x) => Console.WriteLine($"Retrying download of '{x.Link}' ({x.TryNumber}x)")),
//downloadReporter: new Progress<DownloadReport>((x) => Console.WriteLine($"Downloaded ({x})")),
asyncFailurePredicates: [
(x) => Task.FromResult(!x.DocumentNode.InnerHtml.Contains("<div id=\"chapter-container\" class=\"chapter-content\" itemprop=\"description\">"))
//(x) => Task.FromResult(!x.DocumentNode.InnerHtml.Contains("<div id=\"chapter-container\" class=\"chapter-content\" itemprop=\"description\">"))
],
timeOut: TimeSpan.FromSeconds(15),
downloadLogger: logger
+191 -92
View File
@@ -5,133 +5,131 @@ using Beam.Dynamic;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading.Tasks;
namespace Beam.Temporary.Cli {
internal static class NovelStatics {
public static void Define_LightNovelWorld_Novel_TheLegendaryMechanic(SharedDataDictionary sdd) {
var lnwAggregator = new DataKey<WebResource>("aeqw89:document:aggregators:light_novel_world");
var lnwAuxiliary = new DataKey<WebResource>("aeqw89:document:auxillaries:light_novel_world");
var novel = new TextResource() {
Key = new DataKey<TextResource>("novels:the_legendary_mechanic"),
AssociatedSource = lnwAggregator,
AssociatedMetaSource = lnwAuxiliary,
TemplateInitialData = ["the-legendary-mechanic-245", "1"],
MetaTemplateInitialData = ["the-legendary-mechanic"]
};
sdd.Novels.TryAdd(novel.Key, novel);
public static class NovelStatics {
//public static void Define_LightNovelWorld_Novel_TheLegendaryMechanic(SharedDataDictionary sdd) {
// var lnwAggregator = new DataKey<WebResource>("aeqw89:document:aggregators:light_novel_world");
// var lnwAuxiliary = new DataKey<WebResource>("aeqw89:document:auxillaries:light_novel_world");
// var novel = new TextResource() {
// Key = new DataKey<TextResource>("novels:the_legendary_mechanic"),
// AssociatedSource = lnwAggregator,
// AssociatedMetaSource = lnwAuxiliary,
// TemplateInitialData = ["the-legendary-mechanic-245", "1"],
// MetaTemplateInitialData = ["the-legendary-mechanic"]
// };
// sdd.Novels.TryAdd(novel.Key, novel);
sdd.AggregatorNovels.TryAdd(lnwAggregator, [novel.Key]);
}
// sdd.AggregatorNovels.TryAdd(lnwAggregator, [novel.Key]);
//}
public static void Define_LightNovelWorl_Novel_IAloneLevelUp(SharedDataDictionary sdd) {
var lnwAggregator = new DataKey("light_novel_world").ToAggregator().As<WebResource>();
var lnwAuxiliary = new DataKey("light_novel_world").ToAuxiliary().As<WebResource>();
//public static void Define_LightNovelWorl_Novel_IAloneLevelUp(SharedDataDictionary sdd) {
// var lnwAggregator = new DataKey("light_novel_world").ToAggregator().As<WebResource>();
// var lnwAuxiliary = new DataKey("light_novel_world").ToAuxiliary().As<WebResource>();
// var novel = new TextResource() {
// Key = new DataKey<TextResource>("novels:i_alone_level_up"),
// AssociatedSource = lnwAggregator,
// AssociatedMetaSource = lnwAuxiliary,
// TemplateInitialData = ["i-alone-level-up-236", "1"],
// MetaTemplateInitialData = ["i-alone-level-up-solo-leveling-05122225"]
// };
// sdd.Novels.TryAdd(novel.Key, novel);
// sdd.AggregatorNovels.TryAdd(lnwAggregator, [novel.Key]);
//}
public static void Define_WoDuShu_HouseOfHorrors(BeamDataDictionary sdd) {
var (wdsAgg, wdsAux) = CreateKeyPair<WebResource>("aggregators", "auxillaries", "wodushu", "aeqw89:document");
var novel = new TextResource() {
Key = new DataKey<TextResource>("novels:i_alone_level_up"),
AssociatedSource = lnwAggregator,
AssociatedMetaSource = lnwAuxiliary,
TemplateInitialData = ["i-alone-level-up-236", "1"],
MetaTemplateInitialData = ["i-alone-level-up-solo-leveling-05122225"]
Key = new DataKey<TextResource>("novels:house_of_horrors"),
FriendlyName = "My House Of Horrors",
AssociatedSource = wdsAgg,
AssociatedMetaSource = wdsAux,
TemplateInitialData = new State(["24349", "2896325"]),
MetaTemplateInitialData = new State(["24349"])
};
sdd.Novels.TryAdd(novel.Key, novel);
sdd.AggregatorNovels.TryAdd(lnwAggregator, [novel.Key]);
sdd.AggregatorNovels.TryAdd(wdsAgg, [novel.Key]);
}
public static void Define_NovelFull(SharedDataDictionary sdd) {
var docNamespace = "aeqw89:document";
var nfAgg = new DataKey<WebResource>("aggregators:novel_full").WithNamespace(docNamespace);
var nfAux = new DataKey<WebResource>("auxillaries:novel_full").WithNamespace(docNamespace);
var nfBindings = new DataKey<DataBindings>("aeqw89:bindings:light_novel_world");
var aggregator = new WebResource(nfAgg) {
Name = "Novel Full",
Description = "A novel aggregator site",
Domain = "https://novelfull.net",
Bindings = nfBindings
};
var auxiliary = new WebResource(nfAux) {
Name = "Novel Full",
Description = "A novel aggregator site",
Domain = "https://novelfull.net",
Bindings = nfBindings.WithSuffix("_aux")
};
sdd.Templates.TryAdd(nfAgg, new() {
Template = ""
});
/// <summary>
/// Builds two keys that share a common name and namespace but differ in their prefix.
/// </summary>
/// <param name="pref1">Prefix of the first key.</param>
/// <param name="pref2">Prefix of the second key.</param>
/// <param name="common">Name appended to both prefixes after a ':' separator.</param>
/// <param name="namespace">Namespace applied to both keys.</param>
/// <returns>The key built from <paramref name="pref1"/> followed by the key built from <paramref name="pref2"/>.</returns>
private static (DataKey<T>, DataKey<T>) CreateKeyPair<T>(string pref1, string pref2, string common, string @namespace) {
    var first = new DataKey<T>($"{pref1}:{common}").WithNamespace(@namespace);
    var second = new DataKey<T>($"{pref2}:{common}").WithNamespace(@namespace);
    return (first, second);
}
public static void Define_LightNovelWorld(SharedDataDictionary sdd) {
var lnwAggregator = new DataKey<WebResource>("aeqw89:document:aggregators:light_novel_world");
var lnwAuxiliary = new DataKey<WebResource>("aeqw89:document:auxillaries:light_novel_world");
const string lnwBindingsA = "aeqw89:bindings:light_novel_world";
var aggregator = new WebResource(lnwAggregator) {
Name = "Light Novel World",
Description = "A novel aggregator site maintained by NetherClaw",
Domain = "https://www.lightnovelworld.co",
Bindings = new DataKey<DataBindings>(lnwBindingsA)
public static void Define_WoDuShu(BeamDataDictionary sdd) {
var (wdsAgg, wdsAux) = CreateKeyPair<WebResource>("aggregators", "auxillaries", "wodushu", "aeqw89:document");
var bindings = new DataKey<DataBindings>("aeqw89:bindings:wodushu");
var aggregator = new WebResource(wdsAgg) {
Name = "WoDuShu.com",
Description = "A Chinese novel aggregator site",
Domain = "https://wodushu.com",
Bindings = bindings
};
const string lnwBindingsB = "aeqw89:bindings:light_novel_world_aux";
var auxiliary = new WebResource(lnwAuxiliary) {
Name = "Light Novel World",
Description = "A novel aggregator site maintained by NetherClaw",
Domain = "https://www.lightnovelworld.co",
Bindings = new DataKey<DataBindings>(lnwBindingsB)
var auxiliary = new WebResource(wdsAux) {
Name = "WoDuShu.com",
Description = "A Chinese novel aggregator site",
Domain = "https://wodushu.com",
Bindings = bindings.WithSuffix("_aux")
};
sdd.Templates.TryAdd(lnwAuxiliary, new() {
Template = "https://www.lightnovelworld.co/novel/{0}",
IndexOfChapterIndex = -1
sdd.Templates.TryAdd(wdsAgg, new() {
Factory = new(StateChangerFactory.LastAsNumber),
Builder = new SourceLinkBuilder("www.wodushu.com")
.WithSegments("read", "", "")
.WithParameters(1, "")
.WithParameters(2, (".html", Position.After))
});
sdd.Templates.TryAdd(lnwAggregator, new() {
Template = "https://www.lightnovelworld.co/novel/{0}/chapter-{1}",
IndexOfChapterIndex = 1
sdd.Templates.TryAdd(wdsAux, new() {
Factory = new(StateChangerFactory.Constant),
Builder = new SourceLinkBuilder("www.wodushu.com")
.WithSegments("book", "")
.WithParameters(1, "")
});
sdd.Aggregators.TryAdd(aggregator.Key, aggregator);
sdd.Auxillaries.TryAdd(auxiliary.Key, auxiliary);
sdd.Aggregators.TryAdd(wdsAgg, aggregator);
sdd.Auxillaries.TryAdd(wdsAux, auxiliary);
var lnwBindings = new DataKey<DataBindings>(lnwBindingsA);
var lnwBindingsAux = new DataKey<DataBindings>(lnwBindingsB);
sdd.Bindings.TryAdd(lnwBindings, new DataBindings() {
Title = new Binding("aeqw89:binding:light_novel_world:title") {
XPath = "/html/body/main/article/section/div[1]/h1/span[2]",
var binding_agg = new DataKey<DataBindings>("aeqw89:bindings:wodushu");
var binding_aux = new DataKey<DataBindings>("aeqw89:bindings:wodushu_aux");
sdd.Bindings.Add(binding_agg, new() {
Title = new Binding() {
XPath = "/html/body/div[4]/div/div/div[2]/h1",
Type = BindingType.Single
},
Content = new("aeqw89:binding:light_novel_world:content") {
Content = new Binding() {
Type = BindingType.UseProvider,
Provider = new ParagraphedContentDataProvider() {
Content = new Binding() {
XPath = "//*[@id=\"chapter-container\"]"
XPath = "//*[@id=\"content\"]"
}
},
Type = BindingType.UseProvider
}
},
});
sdd.Bindings.TryAdd(lnwBindingsAux, new DataBindings() {
Title = new("aeqw89:binding:light_novel_world_aux:title") {
XPath = "/html/body/main/article/header/div[2]/div[2]/div[1]/h1",
sdd.Bindings.Add(binding_aux, new() {
Title = new Binding() {
XPath = "/html/body/div[3]/div[1]/div/div/div[2]/div[1]/h1",
Type = BindingType.Single
},
Authors = new("aeqw89:binding:light_novel_world_aux:authors") {
XPath = "/html/body/main/article/header/div[2]/div[2]/div[1]/div[1]/a",
Authors = new Binding() {
XPath = "/html/body/div[3]/div[1]/div/div/div[2]/div[1]/div/p[1]/a",
Type = BindingType.Single
},
Description = new("aeqw89:binding:light_novel_world_aux:description") {
Description = new Binding() {
Provider = new ParagraphedContentDataProvider() {
Content = new() {
XPath = "/html/body/main/article/div/section/div[1]/div"
}
},
Type = BindingType.UseProvider
},
Tags = new("aeqw89:binding:light_novel_world_aux:tags") {
Provider = new ListContentDataProvider() {
Content = new() {
XPath = "/html/body/main/article/header/div[2]/div[2]/div[3]/ul"
Content = new Binding() {
XPath = "/html/body/div[3]/div[1]/div/div/div[2]/div[2]"
}
},
Type = BindingType.UseProvider
@@ -139,6 +137,107 @@ namespace Beam.Temporary.Cli {
});
}
//public static void Define_NovelFull(SharedDataDictionary sdd) {
// var docNamespace = "aeqw89:document";
// var nfAgg = new DataKey<WebResource>("aggregators:novel_full").WithNamespace(docNamespace);
// var nfAux = new DataKey<WebResource>("auxillaries:novel_full").WithNamespace(docNamespace);
// var nfBindings = new DataKey<DataBindings>("aeqw89:bindings:light_novel_world");
// var aggregator = new WebResource(nfAgg) {
// Name = "Novel Full",
// Description = "A novel aggregator site",
// Domain = "https://novelfull.net",
// Bindings = nfBindings
// };
// var auxiliary = new WebResource(nfAux) {
// Name = "Novel Full",
// Description = "A novel aggregator site",
// Domain = "https://novelfull.net",
// Bindings = nfBindings.WithSuffix("_aux")
// };
// sdd.Templates.TryAdd(nfAux, new(StateChangerFactory.LastAsNumberPrefixed));
// sdd.Aggregators.TryAdd(nfAgg, aggregator);
// sdd.Auxillaries.TryAdd(nfAux, auxiliary);
// var binding_agg = new DataKey<DataBindings>("aeqw89:bindings:be")
//}
//public static void Define_LightNovelWorld(SharedDataDictionary sdd) {
// var lnwAggregator = new DataKey<WebResource>("aeqw89:document:aggregators:light_novel_world");
// var lnwAuxiliary = new DataKey<WebResource>("aeqw89:document:auxillaries:light_novel_world");
// const string lnwBindingsA = "aeqw89:bindings:light_novel_world";
// var aggregator = new WebResource(lnwAggregator) {
// Name = "Light Novel World",
// Description = "A novel aggregator site maintained by NetherClaw",
// Domain = "https://www.lightnovelworld.co",
// Bindings = new DataKey<DataBindings>(lnwBindingsA)
// };
// const string lnwBindingsB = "aeqw89:bindings:light_novel_world_aux";
// var auxiliary = new WebResource(lnwAuxiliary) {
// Name = "Light Novel World",
// Description = "A novel aggregator site maintained by NetherClaw",
// Domain = "https://www.lightnovelworld.co",
// Bindings = new DataKey<DataBindings>(lnwBindingsB)
// };
// sdd.Templates.TryAdd(lnwAuxiliary, new() {
// Template = "https://www.lightnovelworld.co/novel/{0}",
// IndexOfChapterIndex = -1
// });
// sdd.Templates.TryAdd(lnwAggregator, new() {
// Template = "https://www.lightnovelworld.co/novel/{0}/chapter-{1}",
// IndexOfChapterIndex = 1
// });
// sdd.Aggregators.TryAdd(aggregator.Key, aggregator);
// sdd.Auxillaries.TryAdd(auxiliary.Key, auxiliary);
// var lnwBindings = new DataKey<DataBindings>(lnwBindingsA);
// var lnwBindingsAux = new DataKey<DataBindings>(lnwBindingsB);
// sdd.Bindings.TryAdd(lnwBindings, new DataBindings() {
// Title = new Binding("aeqw89:binding:light_novel_world:title") {
// XPath = "/html/body/main/article/section/div[1]/h1/span[2]",
// Type = BindingType.Single
// },
// Content = new("aeqw89:binding:light_novel_world:content") {
// Provider = new ParagraphedContentDataProvider() {
// Content = new Binding() {
// XPath = "//*[@id=\"chapter-container\"]"
// }
// },
// Type = BindingType.UseProvider
// },
// });
// sdd.Bindings.TryAdd(lnwBindingsAux, new DataBindings() {
// Title = new("aeqw89:binding:light_novel_world_aux:title") {
// XPath = "/html/body/main/article/header/div[2]/div[2]/div[1]/h1",
// Type = BindingType.Single
// },
// Authors = new("aeqw89:binding:light_novel_world_aux:authors") {
// XPath = "/html/body/main/article/header/div[2]/div[2]/div[1]/div[1]/a",
// Type = BindingType.Single
// },
// Description = new("aeqw89:binding:light_novel_world_aux:description") {
// Provider = new ParagraphedContentDataProvider() {
// Content = new() {
// XPath = "/html/body/main/article/div/section/div[1]/div"
// }
// },
// Type = BindingType.UseProvider
// },
// Tags = new("aeqw89:binding:light_novel_world_aux:tags") {
// Provider = new ListContentDataProvider() {
// Content = new() {
// XPath = "/html/body/main/article/header/div[2]/div[2]/div[3]/ul"
// }
// },
// Type = BindingType.UseProvider
// }
// });
//}
}
}
+72 -19
View File
@@ -15,11 +15,11 @@ namespace Beam.Temporary.Cli {
public static JsonSerializerOptions ConversionOptions { get; internal set; } = new();
public static SharedDataDictionary Shared { get; set; } = [];
public static BeamDataDictionary BeamData { get; set; } = [];
public static IArchitecture Architecture = IArchitecture.Default;
const string SharedDataPath = "data/.dat";
const string BeamDataPath = "data/.dat";
static async Task Main(string[] args) {
ConversionOptions.Converters.AddPersistentDataRequiredConverters();
@@ -34,24 +34,49 @@ namespace Beam.Temporary.Cli {
ILogger logger = lf
.CreateLogger("Program");
await using var sharedContext = await DataDictionaryContext<SharedDataDictionary>.Create(
SharedDataPath,
await using var sharedContext = await DataDictionaryContext<BeamDataDictionary>.Create(
BeamDataPath,
false,
DataKind.Shared,
logger,
ConversionOptions
);
Shared = sharedContext.Data;
BeamData = sharedContext.Data;
Shared.Clear();
NovelStatics.Define_LightNovelWorld(Shared);
NovelStatics.Define_LightNovelWorld_Novel_TheLegendaryMechanic(Shared);
NovelStatics.Define_LightNovelWorl_Novel_IAloneLevelUp(Shared);
ClassicTemplates.Register(Shared);
BeamData.Clear();
NovelStatics.Define_WoDuShu(BeamData);
NovelStatics.Define_WoDuShu_HouseOfHorrors(BeamData);
ClassicTemplates.Register(BeamData);
var novel = new DataKey<TextResource>("novels:i_alone_level_up");
var context_aux = Architecture.GetMeta(web, novel, Shared);
CancellationTokenSource cts = new();
// Local aliases over the shared factories in CommonTransformers, so the transformer logic is defined
// in one place instead of being copied into Main.
HtmlTransformer<IDocumentMetaData> ArticleDataTransformer(DataBindings? binding) =>
    CommonTransformers.ArticleDataTransformer(binding);

HtmlTransformer<IDocument> DocumentTransformer(DataBindings? binding, IDocumentMetaData? metaData = null) =>
    CommonTransformers.DocumentTransformer(binding, metaData);
var novel = new DataKey<TextResource>("novels:house_of_horrors");
var context_aux = Architecture.GetMeta(web, novel, BeamData, cts.Token);
context_aux.RetryReporter = new Progress<RetryReport>((x) => Console.WriteLine($"Failed. Trying again. {x.TryNumber}"));
var metaDownloader = new DownloadEnumerable<IDocumentMetaData>(
new SequentialFragmentDownloader<IDocumentMetaData>(
context_aux,
@@ -60,8 +85,17 @@ namespace Beam.Temporary.Cli {
.UnwrapFragmented());
var metadata = (await metaDownloader.FirstAsync());
var context = Architecture.GetTextRecord(web, novel, Shared, metadata.Data);
context.DownloadReporter = new Progress<IDocument>((x) => Console.WriteLine(x.Filename));
var metadata2 = await DownloadBuilder<IDocumentMetaData>.FromMeta(novel, BeamData)
.WithLink()
.WithTransformer(ArticleDataTransformer)
.Configure((x) => x
.WithRetryReporter(new Progress<RetryReport>())
.WithTimeOut(TimeSpan.FromSeconds(15)))
.Build()
.FirstAsync();
var context = Architecture.GetTextRecord(web, novel, BeamData, metadata.Data, cts.Token);
context.DownloadReporter = new Progress<DownloadReport>((x) => Console.WriteLine(x));
var downloader = new DownloadEnumerable<IDocument>(
new SequentialFragmentDownloader<IDocument>(
context,
@@ -69,9 +103,21 @@ namespace Beam.Temporary.Cli {
logger)
.UnwrapFragmented());
List<Ordered<IDocument>> documents = [];
var downloader2 = DownloadBuilder<IDocument>.FromText(novel, BeamData)
.WithLinkGenerator()
.WithTransformer((x) => DocumentTransformer(x, metadata2.Data))
.Configure((x) => x
.WithDownloadReporter(new Progress<DownloadReport>((x) => logger.LogInformation(x.ToString())))
.WithTimeOut(TimeSpan.FromSeconds(15))
)
.Build();
await foreach (var download in downloader.Take(20)) {
List<Task<Ordered<IDocument>>> translationTasks = [];
List<Ordered<IDocument>> documents = [];
await foreach (var download in downloader2.Take(10)) {
if (!download.Data.MetaData.TryGetValue(Architecture.ChapterKey, out var meta))
continue;
if (meta is not ArticleData articleMetaData)
@@ -83,8 +129,15 @@ namespace Beam.Temporary.Cli {
Console.WriteLine($"Chapter title: {articleMetaData.Name}");
//Console.WriteLine($"Content: {download}");
documents.Add(download);
}
//translationTasks.Add(Task.Run(async () => {
// logger.LogInformation("Beginning translation {} task for {}", download.Order, articleMetaData.Name);
// var ret = new Ordered<IDocument>(await QuickAndDirtyJanitor.TranslateAsync(download.Data), download.Order);
// logger.LogInformation("Finished translation {} task for {}", download.Order, articleMetaData.Name);
// return ret;
//}));
}
documents = (await Task.WhenAll(translationTasks)).ToList();
string testDir = Path.Combine("txt", Path.GetRandomFileName());
Directory.CreateDirectory(testDir);
@@ -113,7 +166,7 @@ namespace Beam.Temporary.Cli {
// HtmlBook.Keys.TitlePage,
// HtmlBook.Keys.StylesPage,
//}.Select(
// (x) => Shared.Files.ReadToString(x.WithNamespace("aeqw89:files:templates:classic"))
// (x) => BeamData.Files.ReadToString(x.WithNamespace("aeqw89:files:templates:classic"))
//).ToArray();
//HtmlBook book = new(
@@ -0,0 +1,10 @@
{
"profiles": {
"Beam.Temporary.Cli": {
"commandName": "Project",
"environmentVariables": {
"OPEN_AI_KEY": "<your-openai-api-key>"
}
}
}
}
@@ -0,0 +1,25 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using OpenAI;
using OpenAI.Chat;
namespace Beam.Temporary.Cli {
/// <summary>
/// Proof-of-concept bridge that sends a document's text to an OpenAI chat model and wraps the reply in a new document.
/// </summary>
public class QuickAndDirtyJanitor {
    /// <summary>Name of the environment variable the API key is read from.</summary>
    const string ApiKeyVariable = "OPEN_AI_KEY";

    // Built on first use instead of in a static constructor: a missing key then surfaces as a
    // descriptive InvalidOperationException at the call site rather than a TypeInitializationException.
    static readonly Lazy<OpenAIClient> client = new Lazy<OpenAIClient>(() => CreateClient());

    /// <summary>
    /// Reads the API key from the environment and constructs the client.
    /// </summary>
    /// <exception cref="InvalidOperationException">The environment variable is unset or blank.</exception>
    static OpenAIClient CreateClient() {
        var key = Environment.GetEnvironmentVariable(ApiKeyVariable);
        if (string.IsNullOrWhiteSpace(key))
            throw new InvalidOperationException($"The {ApiKeyVariable} environment variable must be set before documents can be translated.");
        return new OpenAIClient(key);
    }

    /// <summary>
    /// Asks the "gpt-4.1" chat model to translate <paramref name="document"/> into English.
    /// </summary>
    /// <param name="document">The document to translate; its <c>ToString()</c> output is sent as the user message.</param>
    /// <returns>
    /// A <see cref="StringDocument"/> that keeps the original filename and holds the concatenated text of every
    /// content part in the reply (an empty string when the reply carries no text).
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="document"/> is <see langword="null"/>.</exception>
    /// <exception cref="InvalidOperationException">The API key environment variable is unset or blank.</exception>
    public static async Task<IDocument> TranslateAsync(IDocument document) {
        ArgumentNullException.ThrowIfNull(document);
        var chatCompletion = await client.Value.GetChatClient("gpt-4.1").CompleteChatAsync(
            ChatMessage.CreateSystemMessage("Translate the following text into english. If any part of the text has no direct English translation, you may choose to leave it as is. In either case, make sure to leave footnotes for any difficult to translate words. You must translate the whole text and output only your translation and footnotes. No other comments are necessary."),
            ChatMessage.CreateUserMessage("From UNKNOWN to ENGLISH.\n" + document.ToString()));
        // string.Concat treats null entries as empty and returns "" for an empty sequence,
        // so the resulting document never receives null content.
        var translation = string.Concat(chatCompletion.Value.Content.Select((part) => part.Text));
        return new StringDocument(document.Filename, translation);
    }
}
}
+33
View File
@@ -0,0 +1,33 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.Json.Serialization;
using System.Threading.Tasks;
namespace Beam.Temporary.Cli {
/// <summary>
/// Serializable selector that stores the key of a state-change behaviour and resolves it through <see cref="FactoryTable"/>.
/// </summary>
public class StateChangerFactory {
    /// <summary>Key resolving to <c>CommonStateChangers.LastAsNumber</c>.</summary>
    public const string LastAsNumber = "LastAsNumber";
    /// <summary>Key resolving to <c>CommonStateChangers.NthAsNumber(^1, true)</c>.</summary>
    public const string LastAsNumberPrefixed = "LastAsNumberPrefixed";
    /// <summary>Key resolving to <c>CommonStateChangers.Constant</c>.</summary>
    public const string Constant = "Constant";

    /// <summary>Maps each supported key to a delegate that yields the corresponding behaviour.</summary>
    public static Dictionary<string, Func<IStateChangeBehaviour>> FactoryTable = new() {
        { LastAsNumber, () => CommonStateChangers.LastAsNumber },
        { LastAsNumberPrefixed, () => CommonStateChangers.NthAsNumber(^1, true) },
        { Constant, () => CommonStateChangers.Constant },
    };

    /// <summary>
    /// The built-in key names. Retained for existing callers; validation is performed against
    /// <see cref="FactoryTable"/> because that is what <see cref="Behavior"/> actually resolves keys with.
    /// </summary>
    public HashSet<string> Keys = [LastAsNumber, LastAsNumberPrefixed, Constant];

    string stateChangerKey;

    /// <summary>
    /// Gets the behaviour registered for <see cref="StateChangerKey"/>; the registered delegate is invoked on every access.
    /// </summary>
    public IStateChangeBehaviour Behavior => FactoryTable[StateChangerKey]();

    /// <summary>
    /// Gets or sets the key naming the behaviour to produce.
    /// </summary>
    /// <exception cref="ArgumentException">The assigned value is <see langword="null"/> or not registered in <see cref="FactoryTable"/>.</exception>
    [JsonInclude]
    public string StateChangerKey {
        get => stateChangerKey;
        // Validate here as well as in the constructor so that an assignment cannot leave the
        // instance holding a key that Behavior would later fail to look up.
        set => stateChangerKey = EnsureRegistered(value, nameof(value));
    }

    /// <summary>
    /// Creates a factory bound to <paramref name="key"/>.
    /// </summary>
    /// <param name="key">One of the keys registered in <see cref="FactoryTable"/>.</param>
    /// <exception cref="ArgumentException"><paramref name="key"/> is <see langword="null"/> or not registered.</exception>
    // NOTE(review): System.Text.Json matches [JsonConstructor] parameters to properties by name;
    // `key` does not match `StateChangerKey`, so deserialization may fail — confirm, and rename if no caller passes `key:` by name.
    [JsonConstructor]
    public StateChangerFactory(string key) {
        stateChangerKey = EnsureRegistered(key, nameof(key));
    }

    /// <summary>Returns <paramref name="key"/> when it is registered in <see cref="FactoryTable"/>; otherwise throws.</summary>
    static string EnsureRegistered(string key, string paramName) {
        // Dictionary.ContainsKey throws on null, so reject null explicitly with the same exception type.
        if (key is null || !FactoryTable.ContainsKey(key))
            throw new ArgumentException($"{key} not in keys list", paramName);
        return key;
    }
}
}
+12
View File
@@ -0,0 +1,12 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Beam.Temporary.Cli {
/// <summary>
/// Pairs a <see cref="StateChangerFactory"/> with a <see cref="SourceLinkBuilder"/>.
/// </summary>
public class Template {
/// <summary>
/// Gets or sets the state changer factory for this template.
/// NOTE(review): presumably selects how the template's state advances between links — confirm against callers.
/// </summary>
public StateChangerFactory Factory { get; set; }
/// <summary>
/// Gets or sets the source link builder for this template.
/// NOTE(review): how it combines with <see cref="Factory"/> is not visible here — confirm against callers.
/// </summary>
public SourceLinkBuilder Builder { get; set; }
}
}
@@ -6,7 +6,7 @@ using System.Threading.Tasks;
namespace Beam.Temporary.Cli.Templates.Classic {
internal class ClassicTemplates {
public static void Register(SharedDataDictionary sdd) {
public static void Register(BeamDataDictionary sdd) {
sdd.Files.TryAdd(
new("aeqw89:files:templates:classic:content_page"),
new("C:\\Users\\qwsdc\\source\\repos\\Beam\\Beam.Temporary.Cli\\Templates\\Classic\\Content.template.html", "htmlpage", "templates"));
+4 -3
View File
@@ -10,12 +10,13 @@ using System.Threading.Tasks;
namespace Beam.Temporary.Cli {
public class TextResource : IKeyed<TextResource> {
public required DataKey<TextResource> Key { get; set; }
public string? FriendlyName { get; set; }
public DataKey<WebResource>? AssociatedSource { get; set; }
public DataKey<WebResource>? AssociatedMetaSource { get; set; }
public required string[] TemplateInitialData { get; set; }
public string?[]? MetaTemplateInitialData { get; set; }
public required State TemplateInitialData { get; set; }
public State? MetaTemplateInitialData { get; set; }
public TextResourceRecord ToRecord(SharedDataDictionary sdd) {
public TextResourceRecord ToRecord(BeamDataDictionary sdd) {
return new(this,
AssociatedSource is null ? null : sdd.Aggregators[AssociatedSource],
AssociatedMetaSource is null ? null : sdd.Auxillaries[AssociatedMetaSource]);
+1 -1
View File
@@ -19,7 +19,7 @@ namespace Beam.Temporary.Cli {
public WebResource() : this(new(string.Empty)) { }
public WebResourceRecord ToRecord(SharedDataDictionary sdd) {
public WebResourceRecord ToRecord(BeamDataDictionary sdd) {
return new WebResourceRecord(this, sdd.Bindings[Bindings]);
}
}