feat: add deferred response buffering, TableDataProvider, and stealth improvements

- ApiResponse: add readToBuffer option to defer/stream body instead of eagerly buffering
- TableDataProvider: implement HTML table parser with per-column provider support
- StealthConfig: add 10s page load timeout and copyCookiesFrom parameter for cookie sharing
- StealthUnitDownloader: catch WebDriverTimeoutException on navigation, log warning instead of throwing
- Bump version to 2.9.0
This commit is contained in:
qwsdcvghyu89
2026-04-03 11:51:19 +11:00
parent cf75d4a5d5
commit 2965270928
9 changed files with 229 additions and 26 deletions
+8
View File
@@ -8,6 +8,10 @@
</component>
<component name="ChangeListManager">
<list default="true" id="a7e2f92c-8039-47f7-8b93-1c7d5c5d92cc" name="Changes" comment="">
<change afterPath="$PROJECT_DIR$/Beam.Dynamic/DataProviders/TableDataProvider.cs" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/.idea.Beam/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/.idea.Beam/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/Beam.Stealth/StealthConfig.cs" beforeDir="false" afterPath="$PROJECT_DIR$/Beam.Stealth/StealthConfig.cs" afterDir="false" />
<change beforePath="$PROJECT_DIR$/Beam.Stealth/StealthUnitDownloader.cs" beforeDir="false" afterPath="$PROJECT_DIR$/Beam.Stealth/StealthUnitDownloader.cs" afterDir="false" />
<change beforePath="$PROJECT_DIR$/aeqw89.Beam/aeqw89.Beam.csproj" beforeDir="false" afterPath="$PROJECT_DIR$/aeqw89.Beam/aeqw89.Beam.csproj" afterDir="false" />
<change beforePath="$PROJECT_DIR$/aeqw89.Beam/aeqw89.Beam.csproj.bak" beforeDir="false" afterPath="$PROJECT_DIR$/aeqw89.Beam/aeqw89.Beam.csproj.bak" afterDir="false" />
</list>
@@ -44,11 +48,13 @@
<setting file="file://$PROJECT_DIR$/Beam.Dynamic/DataProviders/ManyComposeDataProviders.cs" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/Beam.Dynamic/DataProviders/RelationalDataProvider.cs" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/Beam.Dynamic/DataProviders/SelectDataProvider.cs" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/Beam.Dynamic/DataProviders/TableDataProvider.cs" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/Beam.Exceptions/AssertionException.cs" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/Beam.Exceptions/Exceptions.Designer.cs" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/Beam.Exceptions/Exceptions.resx" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/Beam.Exceptions/MapException.cs" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/Beam.Models/ResourceDefinition.cs" root0="FORCE_HIGHLIGHTING" />
<setting file="file://$PROJECT_DIR$/Beam.Stealth/StealthConfig.cs" root0="FORCE_HIGHLIGHTING" />
</component>
<component name="MetaFilesCheckinStateConfiguration" checkMetaFiles="true" />
<component name="ProblemsViewState">
@@ -121,6 +127,8 @@
<workItem from="1759845817258" duration="647000" />
<workItem from="1760684473926" duration="4000" />
<workItem from="1763031614537" duration="17900000" />
<workItem from="1763822547278" duration="421000" />
<workItem from="1763824632550" duration="6009000" />
</task>
<task id="LOCAL-00001" summary="refactor: modularize Beam into new projects and interfaces&#10;&#10;- Introduced modularity by splitting Beam into new projects: Beam.Abstractions, Beam.Models, and Beam.Downloaders.&#10;- Refactored existing classes into appropriate namespaces and projects.&#10;- Replaced specific implementations with abstractions (e.g., SourceLinkBuilder to LinkBuilder, State to IState, etc.).&#10;- Updated interfaces: added ITemplate, IArticleData, IDownloadReport, and others for improved extensibility.&#10;- Removed deprecated classes like SourceLinkBuilder and StateChangerFactory.&#10;- Enhanced link handling in downloaders by refactoring to use `string` over `SourceLink`. &#10;- Consolidated shared logic under Beam.Abstractions.">
<option name="closed" value="true" />
+3 -3
View File
@@ -22,7 +22,7 @@ namespace Beam.Api;
private string? ContentType = "application/json";
public async Task<ApiResponse> GetResponse(ILogger<ApiResponse>? logger, (int @try, int max)? tries = null, CancellationToken ct = default) {
public async Task<ApiResponse> GetResponse(ILogger<ApiResponse>? logger, (int @try, int max)? tries = null, bool readToBuffer = true, CancellationToken ct = default) {
SanitizeHeaders();
var request = new HttpRequestMessage(Method, Uri);
@@ -40,10 +40,10 @@ namespace Beam.Api;
if (tries is not null && tries?.@try < tries?.max && !SuccessCodes.Contains(response.StatusCode)) {
await Task.Delay((int)Math.Min(Math.Pow(2, tries.Value.@try), 60) * 1000, ct);
return await GetResponse(logger, (tries.Value.@try + 1, tries.Value.max), ct);
return await GetResponse(logger, (tries.Value.@try + 1, tries.Value.max), readToBuffer, ct);
}
return await ApiResponse.CreateAsync(response, logger, RequestData, ct);
return await ApiResponse.CreateAsync(response, logger, RequestData, readToBuffer, ct);
}
private void SanitizeHeaders() {
+2 -2
View File
@@ -33,7 +33,7 @@ namespace Beam.Api;
// sequential
var sequential = new List<ApiResponse>(_calls.Count);
foreach (var call in _calls)
sequential.Add(await call.GetResponse(logger, tries, ct));
sequential.Add(await call.GetResponse(logger, tries, true, ct));
return sequential;
}
@@ -43,7 +43,7 @@ namespace Beam.Api;
_calls.Select((c, i) => (call: c, idx: i)),
new ParallelOptions { MaxDegreeOfParallelism = _maxDegree, CancellationToken = ct },
async (item, token) => {
var response = await item.call.GetResponse(logger, tries, token);
var response = await item.call.GetResponse(logger, tries, true, token);
bag.Add((item.idx, response));
});
+36 -10
View File
@@ -14,11 +14,13 @@ namespace Beam.Api;
/// Wrapper that lets the response body be read any number of times (even concurrently).
/// </summary>
public sealed class ApiResponse {
private readonly byte[] _buffer;
private byte[] _buffer;
private bool _read_has_been_deferred;
/// <summary>
/// Wraps <paramref name="response"/> together with the body bytes read so far.
/// </summary>
/// <param name="response">The HTTP response being wrapped.</param>
/// <param name="buffer">Body bytes already read; an empty array marks the body read as deferred.</param>
/// <param name="logger">Optional logger stored on the instance.</param>
/// <param name="requestData">Optional caller-supplied data stored on the instance.</param>
private ApiResponse(HttpResponseMessage response, byte[] buffer, ILogger<ApiResponse>? logger, object? requestData = null) {
(Response, Logger, RequestData) = (response, logger, requestData);
_buffer = buffer;
// A zero-length buffer is the only signal used to decide that the read was deferred.
_read_has_been_deferred = buffer.Length == 0;
}
@@ -33,8 +35,10 @@ namespace Beam.Api;
HttpResponseMessage response,
ILogger<ApiResponse>? logger = null,
object? requestData = null,
bool readToBuffer = true,
CancellationToken ct = default) {
if (response is null) throw new ArgumentNullException(nameof(response));
if (!readToBuffer) return new ApiResponse(response, [], logger, requestData);
var buffer = response.Content is null
? []
@@ -55,32 +59,54 @@ namespace Beam.Api;
if (!Is200) errorHandler(Response.StatusCode);
return this;
}
/* ---------- content helpers ---------- */
public Task<T?> AsSerializedObject<T>(CancellationToken ct = default) {
/// <summary>
/// Reads the response body into the internal buffer when the read was deferred;
/// does nothing if the buffer has already been filled.
/// </summary>
/// <param name="ct">Token passed to the underlying content read.</param>
private async Task ReadToBuffer(CancellationToken ct = default) {
if (_read_has_been_deferred) {
var content = Response.Content;
// A response without content is represented as an empty buffer.
_buffer = content is null
? []
: await content.ReadAsByteArrayAsync(ct).ConfigureAwait(false);
_read_has_been_deferred = false;
}
}
public async Task<T?> AsSerializedObject<T>(CancellationToken ct = default) {
if (!Is200) throw new InvalidOperationException();
if (Response.Content?.Headers.ContentType?.MediaType != "application/json")
Logger?.LogWarning("Content-Type is not JSON, yet JSON deserialization was requested.");
return Task.FromResult(JsonSerializer.Deserialize<T>(_buffer));
if (_read_has_been_deferred) {
return await JsonSerializer.DeserializeAsync<T>(await Response.Content!.ReadAsStreamAsync(ct), (JsonSerializerOptions?)null, ct);
} else {
return JsonSerializer.Deserialize<T>(_buffer);
}
}
/// <summary>
/// Forwards to <c>AsSerializedObject&lt;T&gt;</c>; the first argument is never read and
/// serves only to let the compiler infer <typeparamref name="T"/>.
/// </summary>
/// <param name="_">Unused value whose static type supplies <typeparamref name="T"/>.</param>
/// <param name="ct">Token forwarded to the deserialization call.</param>
public Task<T?> AsDynamicObject<T>(T _, CancellationToken ct = default) {
return AsSerializedObject<T>(ct);
}
public Task<string> AsString(CancellationToken ct = default) {
public async Task<string> AsString(CancellationToken ct = default) {
if (!Is200) Logger?.LogWarning("Non-success response; attempting to read content.");
return Task.FromResult(Encoding.UTF8.GetString(_buffer));
if (_read_has_been_deferred) {
await ReadToBuffer(ct);
}
return Encoding.UTF8.GetString(_buffer);
}
public Task<byte[]> AsBinary(CancellationToken ct = default) {
public async Task<byte[]> AsBinary(CancellationToken ct = default) {
if (!Is200) Logger?.LogWarning("Non-success response; attempting to read content.");
return Task.FromResult(_buffer);
if (_read_has_been_deferred) {
await ReadToBuffer(ct);
}
return _buffer;
}
public Task<Stream> AsStream(CancellationToken ct = default) {
public async Task<Stream> AsStream(CancellationToken ct = default) {
if (!Is200) Logger?.LogWarning("Non-success response; attempting to read content.");
return Task.FromResult<Stream>(new MemoryStream(_buffer, writable: false));
if (_read_has_been_deferred) {
return await Response.Content!.ReadAsStreamAsync(ct);
} else {
return new MemoryStream(_buffer, writable: false);
}
}
}
+152 -3
View File
@@ -1,5 +1,154 @@
using System.Text.Json;
using Beam.Abstractions;
using HtmlAgilityPack;
namespace Beam.Dynamic;
public class TableDataProvider {
}
/// <summary>
/// Reads the <c>tr</c> rows found under a selected node into a jagged array of cell strings.
/// Through the string-typed provider interfaces the same array is returned serialized as JSON.
/// </summary>
public class TableDataProvider
    : IComposableDataProvider<string>,
      IComposableDataProvider<string[][]>
{
    /// <summary>Binding used to pick the node to scan inside a document.</summary>
    public IBinding? Content { get; set; }

    /// <summary>
    /// One provider per column; each is executed once per row.
    /// Columns without a provider (missing or <c>null</c> entries) fall back to a default
    /// that returns the text of the td/th cell at that column index.
    /// </summary>
    public IDataProvider<string>[]? ColumnProviders { get; set; }

    /// <summary>
    /// Selects the node via <see cref="Content"/> and extracts its rows.
    /// Returns an empty array when there is no binding or nothing is selected.
    /// </summary>
    public string[][] Get(HtmlDocument document)
    {
        if (Content is null) return [];

        var selected = Select(document);
        return selected is null ? [] : Get(selected);
    }

    /// <summary>String view of the document-level lookup: JSON of the rows, or "" when nothing is selected.</summary>
    string IDataProvider<string>.Get(HtmlDocument document)
    {
        if (Select(document) is not { } selected) return "";
        return ((IComposableDataProvider<string>)this).Get(selected);
    }

    /// <summary>
    /// Extracts every <c>tr</c> descendant of <paramref name="node"/> as one output row.
    /// The column count is the larger of the number of configured providers and the
    /// widest row (counted in direct td/th children); an empty array is returned when
    /// there are no rows or no columns.
    /// </summary>
    public string[][] Get(HtmlNode node)
    {
        var rowNodes = node.Descendants("tr").ToList();
        if (rowNodes.Count == 0) return [];

        var widestRow = 0;
        foreach (var rowNode in rowNodes)
            widestRow = Math.Max(widestRow, rowNode.ChildNodes.Count(IsCell));

        var columnCount = Math.Max(ColumnProviders?.Length ?? 0, widestRow);
        if (columnCount == 0) return [];

        var providers = BuildEffectiveProviders(columnCount);
        return rowNodes
            .Select(rowNode => ReadRow(rowNode, providers, columnCount))
            .ToArray();
    }

    /// <summary>String view of the node-level lookup: the extracted rows serialized as JSON.</summary>
    string IComposableDataProvider<string>.Get(HtmlNode node)
        => JsonSerializer.Serialize(Get(node));

    /// <summary>Resolves the node to scan through <see cref="Content"/>; <c>null</c> without a binding.</summary>
    public HtmlNode? Select(HtmlDocument doc) => Content?.Select(doc);

    // When composed, the node handed in is used as-is.
    HtmlNode? IComposableDataProvider<string[][]>.Select(HtmlNode node) => node;

    HtmlNode? IComposableDataProvider<string>.Select(HtmlNode node) => node;

    /// <summary>True for direct cell elements (td or th).</summary>
    private static bool IsCell(HtmlNode candidate)
        => candidate.Name == "td" || candidate.Name == "th";

    /// <summary>Runs every column provider against one row and returns the resulting cell strings.</summary>
    private static string[] ReadRow(HtmlNode rowNode, IDataProvider<string>[] providers, int columnCount)
    {
        var cells = new string[columnCount];
        for (var column = 0; column < columnCount; column++)
        {
            // Composable providers get the row itself; plain providers only understand
            // whole documents, so they receive the row's owning document instead.
            var value = providers[column] is IComposableDataProvider<string> rowAware
                ? rowAware.Get(rowNode)
                : providers[column].Get(rowNode.OwnerDocument);
            cells[column] = value ?? "";
        }

        return cells;
    }

    /// <summary>
    /// Builds the provider list actually used: configured providers where present and non-null,
    /// default cell-text providers for every other column index.
    /// </summary>
    private IDataProvider<string>[] BuildEffectiveProviders(int columnCount)
    {
        var configured = ColumnProviders;
        var effective = new IDataProvider<string>[columnCount];

        for (var index = 0; index < columnCount; index++)
        {
            var supplied = configured is not null && index < configured.Length
                ? configured[index]
                : null;
            effective[index] = supplied ?? new ColumnCellContentsProvider(index);
        }

        return effective;
    }

    /// <summary>
    /// Default column provider: for a given row, returns the text of the td/th at <see cref="ColumnIndex"/>.
    /// </summary>
    private sealed class ColumnCellContentsProvider(int columnIndex) : IComposableDataProvider<string>
    {
        /// <summary>Zero-based position of the cell among the row's td/th children.</summary>
        public int ColumnIndex { get; } = columnIndex;

        public string Get(HtmlDocument document)
        {
            var root = Select(document);
            return root is null ? "" : Get(root);
        }

        /// <summary>Returns the cell's inner text, or "" when the row has no cell at that index.</summary>
        public string Get(HtmlNode rowNode)
        {
            // ElementAtOrDefault yields null for negative and out-of-range indexes alike.
            var cell = rowNode.ChildNodes.Where(IsCell).ElementAtOrDefault(ColumnIndex);
            return cell is null ? "" : cell.InnerText;
        }

        public HtmlNode? Select(HtmlDocument doc) => doc.DocumentNode;

        public HtmlNode? Select(HtmlNode node) => node;
    }
}
+14 -2
View File
@@ -41,6 +41,8 @@ namespace Beam.Stealth {
o.SetPreference("pdfjs.disabled", true); // open PDFs externally
o.SetPreference("browser.download.manager.showWhenStarting", false);
o.PageLoadTimeout = TimeSpan.FromSeconds(10);
return o;
}
@@ -56,6 +58,8 @@ namespace Beam.Stealth {
// common stability flags
o.AddArgument("--no-sandbox");
o.AddArgument("--disable-dev-shm-usage");
o.PageLoadTimeout = TimeSpan.FromSeconds(10);
return o;
}
@@ -68,6 +72,8 @@ namespace Beam.Stealth {
o.AddUserProfilePreference("download.prompt_for_download", false);
o.AddUserProfilePreference("safebrowsing.enabled", false);
o.PageLoadTimeout = TimeSpan.FromSeconds(10);
return o;
}
@@ -80,7 +86,8 @@ namespace Beam.Stealth {
Browser preferredBrowser = Browser.Firefox,
string? remoteAddress = null,
Addon[]? utilityAddons = null,
ILogger? logger = null) {
ILogger? logger = null,
IWebDriver? copyCookiesFrom = null) {
// pick or create a dedicated download folder
downloadDir ??= Path.Combine(Path.GetTempPath(), Path.GetRandomFileName());
Directory.CreateDirectory(downloadDir);
@@ -130,6 +137,11 @@ namespace Beam.Stealth {
if (driver is null)
throw new AggregateException(errors);
if (copyCookiesFrom != null) {
foreach (var cookie in copyCookiesFrom.Manage().Cookies.AllCookies)
driver.Manage().Cookies.AddCookie(new Cookie(cookie.Name, cookie.Value, cookie.Domain, cookie.Path, cookie.Expiry));
}
return new StealthConfig(downloadDir) {
ShowBrowser = showBrowser,
@@ -139,7 +151,7 @@ namespace Beam.Stealth {
Driver = driver
};
}
/// <summary>Disposes the <c>Driver</c> held by this configuration.</summary>
public void Dispose() => Driver.Dispose();
+10 -2
View File
@@ -10,6 +10,8 @@ using Beam.Abstractions;
using Beam.Downloaders;
using Beam.Models;
using Beam.Stealth.Strategies;
using OpenQA.Selenium;
using OpenQA.Selenium.Firefox;
namespace Beam.Stealth {
using File = System.IO.File;
@@ -35,9 +37,15 @@ namespace Beam.Stealth {
protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream,
IProgress<IDownloadReport> progress, CancellationToken ct) {
var driver = Config.Driver;
await driver.Navigate().GoToUrlAsync(url);
await Manipulator(driver);
try {
await driver.Navigate().GoToUrlAsync(url);
}
catch (WebDriverTimeoutException) {
Logger?.LogWarning("Timeout navigating to {url}", url);
}
await Manipulator(driver);
await _downloadStrategy.DownloadToStream(url, bufferSize, destinationStream, progress, Config, Logger, ct);
}
+2 -2
View File
@@ -7,12 +7,12 @@
<Title>Beam</Title>
<Authors>aeqw89</Authors>
<Company>qwsdcvghyu</Company>
<Version>2.7.0</Version>
<Version>2.9.0</Version>
<Description>A library for downloading internet resources</Description>
<PackageProjectUrl>https://github.com/qwsdcvghyu89/Beam</PackageProjectUrl>
<RepositoryUrl>https://github.com/qwsdcvghyu89/Beam</RepositoryUrl>
<PackageId>aeqw89.Beam</PackageId>
<PackageVersion>2.7.0</PackageVersion>
<PackageVersion>2.9.0</PackageVersion>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\Beam.Api\Beam.Api.csproj">
+2 -2
View File
@@ -7,12 +7,12 @@
<Title>Beam</Title>
<Authors>aeqw89</Authors>
<Company>qwsdcvghyu</Company>
<Version>2.6.3</Version>
<Version>2.8.4</Version>
<Description>A library for downloading internet resources</Description>
<PackageProjectUrl>https://github.com/qwsdcvghyu89/Beam</PackageProjectUrl>
<RepositoryUrl>https://github.com/qwsdcvghyu89/Beam</RepositoryUrl>
<PackageId>aeqw89.Beam</PackageId>
<PackageVersion>2.6.3</PackageVersion>
<PackageVersion>2.8.4</PackageVersion>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\Beam.Api\Beam.Api.csproj">