feat: add deferred response buffering, TableDataProvider, and stealth improvements
- ApiResponse: add readToBuffer option to defer/stream body instead of eagerly buffering
- TableDataProvider: implement HTML table parser with per-column provider support
- StealthConfig: add 10s page load timeout and copyCookiesFrom parameter for cookie sharing
- StealthUnitDownloader: catch WebDriverTimeoutException on navigation, log warning instead of throwing
- Bump version to 2.9.0
This commit is contained in:
Generated
+8
@@ -8,6 +8,10 @@
|
||||
</component>
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="a7e2f92c-8039-47f7-8b93-1c7d5c5d92cc" name="Changes" comment="">
|
||||
<change afterPath="$PROJECT_DIR$/Beam.Dynamic/DataProviders/TableDataProvider.cs" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/.idea/.idea.Beam/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/.idea.Beam/.idea/workspace.xml" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/Beam.Stealth/StealthConfig.cs" beforeDir="false" afterPath="$PROJECT_DIR$/Beam.Stealth/StealthConfig.cs" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/Beam.Stealth/StealthUnitDownloader.cs" beforeDir="false" afterPath="$PROJECT_DIR$/Beam.Stealth/StealthUnitDownloader.cs" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/aeqw89.Beam/aeqw89.Beam.csproj" beforeDir="false" afterPath="$PROJECT_DIR$/aeqw89.Beam/aeqw89.Beam.csproj" afterDir="false" />
|
||||
<change beforePath="$PROJECT_DIR$/aeqw89.Beam/aeqw89.Beam.csproj.bak" beforeDir="false" afterPath="$PROJECT_DIR$/aeqw89.Beam/aeqw89.Beam.csproj.bak" afterDir="false" />
|
||||
</list>
|
||||
@@ -44,11 +48,13 @@
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Dynamic/DataProviders/ManyComposeDataProviders.cs" root0="FORCE_HIGHLIGHTING" />
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Dynamic/DataProviders/RelationalDataProvider.cs" root0="FORCE_HIGHLIGHTING" />
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Dynamic/DataProviders/SelectDataProvider.cs" root0="FORCE_HIGHLIGHTING" />
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Dynamic/DataProviders/TableDataProvider.cs" root0="FORCE_HIGHLIGHTING" />
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Exceptions/AssertionException.cs" root0="FORCE_HIGHLIGHTING" />
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Exceptions/Exceptions.Designer.cs" root0="FORCE_HIGHLIGHTING" />
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Exceptions/Exceptions.resx" root0="FORCE_HIGHLIGHTING" />
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Exceptions/MapException.cs" root0="FORCE_HIGHLIGHTING" />
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Models/ResourceDefinition.cs" root0="FORCE_HIGHLIGHTING" />
|
||||
<setting file="file://$PROJECT_DIR$/Beam.Stealth/StealthConfig.cs" root0="FORCE_HIGHLIGHTING" />
|
||||
</component>
|
||||
<component name="MetaFilesCheckinStateConfiguration" checkMetaFiles="true" />
|
||||
<component name="ProblemsViewState">
|
||||
@@ -121,6 +127,8 @@
|
||||
<workItem from="1759845817258" duration="647000" />
|
||||
<workItem from="1760684473926" duration="4000" />
|
||||
<workItem from="1763031614537" duration="17900000" />
|
||||
<workItem from="1763822547278" duration="421000" />
|
||||
<workItem from="1763824632550" duration="6009000" />
|
||||
</task>
|
||||
<task id="LOCAL-00001" summary="refactor: modularize Beam into new projects and interfaces - Introduced modularity by splitting Beam into new projects: Beam.Abstractions, Beam.Models, and Beam.Downloaders. - Refactored existing classes into appropriate namespaces and projects. - Replaced specific implementations with abstractions (e.g., SourceLinkBuilder to LinkBuilder, State to IState, etc.). - Updated interfaces: added ITemplate, IArticleData, IDownloadReport, and others for improved extensibility. - Removed deprecated classes like SourceLinkBuilder and StateChangerFactory. - Enhanced link handling in downloaders by refactoring to use `string` over `SourceLink`. - Consolidated shared logic under Beam.Abstractions.">
|
||||
<option name="closed" value="true" />
|
||||
|
||||
+3
-3
@@ -22,7 +22,7 @@ namespace Beam.Api;
|
||||
|
||||
private string? ContentType = "application/json";
|
||||
|
||||
public async Task<ApiResponse> GetResponse(ILogger<ApiResponse>? logger, (int @try, int max)? tries = null, CancellationToken ct = default) {
|
||||
public async Task<ApiResponse> GetResponse(ILogger<ApiResponse>? logger, (int @try, int max)? tries = null, bool readToBuffer = true, CancellationToken ct = default) {
|
||||
SanitizeHeaders();
|
||||
|
||||
var request = new HttpRequestMessage(Method, Uri);
|
||||
@@ -40,10 +40,10 @@ namespace Beam.Api;
|
||||
|
||||
if (tries is not null && tries?.@try < tries?.max && !SuccessCodes.Contains(response.StatusCode)) {
|
||||
await Task.Delay((int)Math.Min(Math.Pow(2, tries.Value.@try), 60) * 1000, ct);
|
||||
return await GetResponse(logger, (tries.Value.@try + 1, tries.Value.max), ct);
|
||||
return await GetResponse(logger, (tries.Value.@try + 1, tries.Value.max), readToBuffer, ct);
|
||||
}
|
||||
|
||||
return await ApiResponse.CreateAsync(response, logger, RequestData, ct);
|
||||
return await ApiResponse.CreateAsync(response, logger, RequestData, readToBuffer, ct);
|
||||
}
|
||||
|
||||
private void SanitizeHeaders() {
|
||||
|
||||
@@ -33,7 +33,7 @@ namespace Beam.Api;
|
||||
// sequential
|
||||
var sequential = new List<ApiResponse>(_calls.Count);
|
||||
foreach (var call in _calls)
|
||||
sequential.Add(await call.GetResponse(logger, tries, ct));
|
||||
sequential.Add(await call.GetResponse(logger, tries, true, ct));
|
||||
return sequential;
|
||||
}
|
||||
|
||||
@@ -43,7 +43,7 @@ namespace Beam.Api;
|
||||
_calls.Select((c, i) => (call: c, idx: i)),
|
||||
new ParallelOptions { MaxDegreeOfParallelism = _maxDegree, CancellationToken = ct },
|
||||
async (item, token) => {
|
||||
var response = await item.call.GetResponse(logger, tries, token);
|
||||
var response = await item.call.GetResponse(logger, tries, true, token);
|
||||
bag.Add((item.idx, response));
|
||||
});
|
||||
|
||||
|
||||
+36
-10
@@ -14,11 +14,13 @@ namespace Beam.Api;
|
||||
/// Wrapper that lets the response body be read any number of times (even concurrently).
|
||||
/// </summary>
|
||||
public sealed class ApiResponse {
|
||||
private readonly byte[] _buffer;
|
||||
private byte[] _buffer;
|
||||
private bool _read_has_been_deferred;
|
||||
|
||||
/// <summary>
/// Stores the HTTP response together with any body bytes that were already read.
/// </summary>
/// <param name="response">The HTTP response being wrapped.</param>
/// <param name="buffer">Body bytes read so far; an empty array marks the read as deferred.</param>
/// <param name="logger">Optional logger used by the content helpers.</param>
/// <param name="requestData">Optional caller-supplied data associated with the request.</param>
private ApiResponse(HttpResponseMessage response, byte[] buffer, ILogger<ApiResponse>? logger, object? requestData = null) {
    // An empty buffer is interpreted as "body not read yet".
    // NOTE(review): an eagerly read body that happens to be empty is also flagged as deferred here.
    _read_has_been_deferred = buffer.Length == 0;
    _buffer = buffer;

    Response = response;
    Logger = logger;
    RequestData = requestData;
}
|
||||
@@ -33,8 +35,10 @@ namespace Beam.Api;
|
||||
HttpResponseMessage response,
|
||||
ILogger<ApiResponse>? logger = null,
|
||||
object? requestData = null,
|
||||
bool readToBuffer = true,
|
||||
CancellationToken ct = default) {
|
||||
if (response is null) throw new ArgumentNullException(nameof(response));
|
||||
if (!readToBuffer) return new ApiResponse(response, [], logger, requestData);
|
||||
|
||||
var buffer = response.Content is null
|
||||
? []
|
||||
@@ -55,32 +59,54 @@ namespace Beam.Api;
|
||||
if (!Is200) errorHandler(Response.StatusCode);
|
||||
return this;
|
||||
}
|
||||
|
||||
/* ---------- content helpers ---------- */
|
||||
|
||||
public Task<T?> AsSerializedObject<T>(CancellationToken ct = default) {
|
||||
/// <summary>
/// Reads the response body into the internal buffer if that read was deferred.
/// Does nothing when the read is no longer marked as deferred.
/// </summary>
/// <param name="ct">Token passed to the body read.</param>
private async Task ReadToBuffer(CancellationToken ct = default) {
    if (_read_has_been_deferred) {
        var content = Response.Content;

        // A response without content yields an empty buffer.
        _buffer = content is not null
            ? await content.ReadAsByteArrayAsync(ct).ConfigureAwait(false)
            : [];

        _read_has_been_deferred = false;
    }
}
|
||||
|
||||
/// <summary>
/// Deserializes the JSON response body into <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Target type of the deserialization.</typeparam>
/// <param name="ct">Token passed to the stream read and the deserializer.</param>
/// <returns>The deserialized value, or <c>null</c> if the JSON payload is a null literal.</returns>
/// <exception cref="InvalidOperationException">The response is not a success response.</exception>
public async Task<T?> AsSerializedObject<T>(CancellationToken ct = default) {
    if (!Is200)
        throw new InvalidOperationException(
            $"Cannot deserialize the body of a non-success response (status {Response.StatusCode}).");

    if (Response.Content?.Headers.ContentType?.MediaType != "application/json")
        Logger?.LogWarning("Content-Type is not JSON, yet JSON deserialization was requested.");

    // Deferred read: deserialize straight from the content stream instead of buffering first.
    // The Content null check replaces a null-forgiving dereference; a response without
    // content falls through to the buffer path like an eagerly read empty body.
    // NOTE(review): this path does not populate the buffer; whether the body can be read
    // again afterwards depends on how the HTTP stack buffered the content - confirm.
    if (_read_has_been_deferred && Response.Content is { } content) {
        var stream = await content.ReadAsStreamAsync(ct).ConfigureAwait(false);
        return await JsonSerializer
            .DeserializeAsync<T>(stream, (JsonSerializerOptions?)null, ct)
            .ConfigureAwait(false);
    }

    return JsonSerializer.Deserialize<T>(_buffer);
}
|
||||
|
||||
/// <summary>
/// Deserializes the JSON body into <typeparamref name="T"/> by delegating to
/// <c>AsSerializedObject</c>.
/// </summary>
/// <param name="_">Unused value; presumably only supplies <typeparamref name="T"/> through type inference - verify against callers.</param>
/// <param name="ct">Token forwarded to <c>AsSerializedObject</c>.</param>
public Task<T?> AsDynamicObject<T>(T _, CancellationToken ct = default) {
    return AsSerializedObject<T>(ct);
}
|
||||
|
||||
/// <summary>
/// Returns the response body decoded as UTF-8 text, reading it into the buffer first if the
/// read was deferred. A non-success response only produces a warning.
/// </summary>
/// <param name="ct">Token passed to the deferred body read.</param>
public async Task<string> AsString(CancellationToken ct = default) {
    if (!Is200) {
        Logger?.LogWarning("Non-success response; attempting to read content.");
    }

    // ReadToBuffer returns immediately when the read is not deferred.
    await ReadToBuffer(ct);

    var bytes = _buffer;
    return Encoding.UTF8.GetString(bytes);
}
|
||||
|
||||
/// <summary>
/// Returns the response body as the internal byte buffer, reading it first if the read was
/// deferred. A non-success response only produces a warning.
/// </summary>
/// <param name="ct">Token passed to the deferred body read.</param>
public async Task<byte[]> AsBinary(CancellationToken ct = default) {
    if (!Is200) {
        Logger?.LogWarning("Non-success response; attempting to read content.");
    }

    // ReadToBuffer returns immediately when the read is not deferred.
    await ReadToBuffer(ct);

    return _buffer;
}
|
||||
|
||||
/// <summary>
/// Returns the response body as a stream. A non-success response only produces a warning.
/// </summary>
/// <param name="ct">Token passed to the content stream read.</param>
/// <returns>
/// The content stream of the response when the read is marked as deferred and the response
/// has content; otherwise a read-only <see cref="MemoryStream"/> over the internal buffer.
/// </returns>
public async Task<Stream> AsStream(CancellationToken ct = default) {
    if (!Is200) Logger?.LogWarning("Non-success response; attempting to read content.");

    // The Content null check replaces a null-forgiving dereference: a response without
    // content now yields an empty stream instead of a NullReferenceException.
    // NOTE(review): the deferred path hands out the content stream without filling the
    // buffer; whether later reads still see the body depends on HTTP-stack buffering - confirm.
    if (_read_has_been_deferred && Response.Content is { } content)
        return await content.ReadAsStreamAsync(ct).ConfigureAwait(false);

    return new MemoryStream(_buffer, writable: false);
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,154 @@
|
||||
|
||||
using System.Text.Json;
|
||||
using Beam.Abstractions;
|
||||
using HtmlAgilityPack;
|
||||
|
||||
namespace Beam.Dynamic;
|
||||
|
||||
public class TableDataProvider {
|
||||
|
||||
}
|
||||
/// <summary>
/// Extracts the rows of an HTML table as a jagged string array, or as the JSON serialization
/// of that array when used through the string-typed provider interfaces.
/// </summary>
public class TableDataProvider
    : IComposableDataProvider<string>,
      IComposableDataProvider<string[][]>
{
    /// <summary>Binding used to locate the node whose rows are read.</summary>
    public IBinding? Content { get; set; }

    /// <summary>
    /// One provider per column. Each provider is executed per row.
    /// Missing columns are filled with defaults that return the td/th text at that column index.
    /// </summary>
    public IDataProvider<string>[]? ColumnProviders { get; set; }

    /// <summary>
    /// Reads the table selected by <see cref="Content"/> from <paramref name="document"/>.
    /// Returns an empty array when no binding is set or nothing is selected.
    /// </summary>
    public string[][] Get(HtmlDocument document)
    {
        if (Content is not null && Select(document) is { } table)
            return Get(table);

        return [];
    }

    /// <summary>
    /// String form of the document-level read: the JSON of the selected table, or an empty
    /// string when nothing is selected.
    /// </summary>
    string IDataProvider<string>.Get(HtmlDocument document)
    {
        if (Select(document) is not { } table)
            return "";

        IComposableDataProvider<string> jsonView = this;
        return jsonView.Get(table);
    }

    /// <summary>
    /// Reads every <c>tr</c> descendant of <paramref name="node"/> into one string array per row.
    /// The column count is the larger of the configured provider count and the largest number
    /// of direct td/th children found in any row.
    /// </summary>
    public string[][] Get(HtmlNode node)
    {
        var rowNodes = node.Descendants("tr").ToList();
        if (rowNodes.Count == 0)
            return [];

        var widestRow = 0;
        foreach (var row in rowNodes)
            widestRow = Math.Max(widestRow, row.ChildNodes.Count(IsCell));

        var columnCount = Math.Max(ColumnProviders?.Length ?? 0, widestRow);
        if (columnCount == 0)
            return [];

        var providers = BuildEffectiveProviders(columnCount);
        return rowNodes.Select(row => ReadRow(row, providers)).ToArray();
    }

    /// <summary>Serializes the rows read from <paramref name="node"/> as JSON.</summary>
    string IComposableDataProvider<string>.Get(HtmlNode node)
    {
        var rows = Get(node);
        return JsonSerializer.Serialize(rows);
    }

    /// <summary>Selects the table node through <see cref="Content"/>; null when no binding is set.</summary>
    public HtmlNode? Select(HtmlDocument doc)
    {
        return Content?.Select(doc);
    }

    // Node-level selection is the identity for both interface views.
    HtmlNode? IComposableDataProvider<string[][]>.Select(HtmlNode node) => node;

    HtmlNode? IComposableDataProvider<string>.Select(HtmlNode node) => node;

    /// <summary>True for the element names treated as table cells.</summary>
    private static bool IsCell(HtmlNode candidate) => candidate.Name is "td" or "th";

    /// <summary>Runs every column provider against a single row.</summary>
    private static string[] ReadRow(HtmlNode row, IDataProvider<string>[] providers)
    {
        var values = new string[providers.Length];

        for (var col = 0; col < values.Length; col++)
        {
            // Composable providers get the row as context; others only see the owning document.
            var value = providers[col] is IComposableDataProvider<string> rowAware
                ? rowAware.Get(row)
                : providers[col].Get(row.OwnerDocument);

            values[col] = value ?? "";
        }

        return values;
    }

    /// <summary>
    /// Builds one provider per output column: the configured provider where one exists and is
    /// not null, otherwise a default that returns the cell text at that column index.
    /// </summary>
    private IDataProvider<string>[] BuildEffectiveProviders(int columnCount)
    {
        var configured = ColumnProviders;
        var result = new IDataProvider<string>[columnCount];

        for (var i = 0; i < columnCount; i++)
        {
            var custom = configured is not null && i < configured.Length ? configured[i] : null;
            result[i] = custom ?? new ColumnCellContentsProvider(i);
        }

        return result;
    }

    /// <summary>
    /// Default column provider: for a given row, returns text of td/th at ColumnIndex.
    /// </summary>
    private sealed class ColumnCellContentsProvider : IComposableDataProvider<string>
    {
        public ColumnCellContentsProvider(int columnIndex) => ColumnIndex = columnIndex;

        public int ColumnIndex { get; }

        /// <summary>Applies the row lookup to the document's root node.</summary>
        public string Get(HtmlDocument document)
        {
            var root = Select(document);
            if (root is null)
                return "";

            return Get(root);
        }

        /// <summary>
        /// Returns the inner text of the direct td/th child at <see cref="ColumnIndex"/>,
        /// or an empty string when the index is out of range.
        /// </summary>
        public string Get(HtmlNode rowNode)
        {
            var cells = rowNode.ChildNodes.Where(IsCell);
            if (ColumnIndex < 0)
                return "";

            var cell = cells.Skip(ColumnIndex).FirstOrDefault();
            return cell is null ? "" : cell.InnerText;
        }

        public HtmlNode? Select(HtmlDocument doc) => doc.DocumentNode;

        public HtmlNode? Select(HtmlNode node) => node;
    }
}
|
||||
|
||||
@@ -41,6 +41,8 @@ namespace Beam.Stealth {
|
||||
o.SetPreference("pdfjs.disabled", true); // open PDFs externally
|
||||
o.SetPreference("browser.download.manager.showWhenStarting", false);
|
||||
|
||||
o.PageLoadTimeout = TimeSpan.FromSeconds(10);
|
||||
|
||||
return o;
|
||||
}
|
||||
|
||||
@@ -56,6 +58,8 @@ namespace Beam.Stealth {
|
||||
// common stability flags
|
||||
o.AddArgument("--no-sandbox");
|
||||
o.AddArgument("--disable-dev-shm-usage");
|
||||
|
||||
o.PageLoadTimeout = TimeSpan.FromSeconds(10);
|
||||
|
||||
return o;
|
||||
}
|
||||
@@ -68,6 +72,8 @@ namespace Beam.Stealth {
|
||||
o.AddUserProfilePreference("download.prompt_for_download", false);
|
||||
o.AddUserProfilePreference("safebrowsing.enabled", false);
|
||||
|
||||
o.PageLoadTimeout = TimeSpan.FromSeconds(10);
|
||||
|
||||
return o;
|
||||
}
|
||||
|
||||
@@ -80,7 +86,8 @@ namespace Beam.Stealth {
|
||||
Browser preferredBrowser = Browser.Firefox,
|
||||
string? remoteAddress = null,
|
||||
Addon[]? utilityAddons = null,
|
||||
ILogger? logger = null) {
|
||||
ILogger? logger = null,
|
||||
IWebDriver? copyCookiesFrom = null) {
|
||||
// pick or create a dedicated download folder
|
||||
downloadDir ??= Path.Combine(Path.GetTempPath(), Path.GetRandomFileName());
|
||||
Directory.CreateDirectory(downloadDir);
|
||||
@@ -130,6 +137,11 @@ namespace Beam.Stealth {
|
||||
|
||||
if (driver is null)
|
||||
throw new AggregateException(errors);
|
||||
|
||||
if (copyCookiesFrom != null) {
|
||||
foreach (var cookie in copyCookiesFrom.Manage().Cookies.AllCookies)
|
||||
driver.Manage().Cookies.AddCookie(new Cookie(cookie.Name, cookie.Value, cookie.Domain, cookie.Path, cookie.Expiry));
|
||||
}
|
||||
|
||||
return new StealthConfig(downloadDir) {
|
||||
ShowBrowser = showBrowser,
|
||||
@@ -139,7 +151,7 @@ namespace Beam.Stealth {
|
||||
Driver = driver
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
/// <summary>Disposes the driver held by this configuration.</summary>
public void Dispose() => Driver.Dispose();
|
||||
|
||||
@@ -10,6 +10,8 @@ using Beam.Abstractions;
|
||||
using Beam.Downloaders;
|
||||
using Beam.Models;
|
||||
using Beam.Stealth.Strategies;
|
||||
using OpenQA.Selenium;
|
||||
using OpenQA.Selenium.Firefox;
|
||||
|
||||
namespace Beam.Stealth {
|
||||
using File = System.IO.File;
|
||||
@@ -35,9 +37,15 @@ namespace Beam.Stealth {
|
||||
/// <summary>
/// Navigates the configured driver to <paramref name="url"/>, runs the manipulator on it, and
/// then hands the download over to the configured download strategy.
/// </summary>
/// <param name="url">Address to navigate to and download from.</param>
/// <param name="bufferSize">Buffer size forwarded to the download strategy.</param>
/// <param name="destinationStream">Stream forwarded to the download strategy as the destination.</param>
/// <param name="progress">Progress sink forwarded to the download strategy.</param>
/// <param name="ct">Token forwarded to the download strategy.</param>
protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream,
    IProgress<IDownloadReport> progress, CancellationToken ct) {
    var driver = Config.Driver;

    try {
        await driver.Navigate().GoToUrlAsync(url);
    }
    catch (WebDriverTimeoutException ex) {
        // A navigation timeout is tolerated; the remaining steps still run against the driver.
        // The exception is passed to the logger so its details are kept in the warning.
        Logger?.LogWarning(ex, "Timeout navigating to {url}", url);
    }

    await Manipulator(driver);

    await _downloadStrategy.DownloadToStream(url, bufferSize, destinationStream, progress, Config, Logger, ct);
}
|
||||
|
||||
|
||||
@@ -7,12 +7,12 @@
|
||||
<Title>Beam</Title>
|
||||
<Authors>aeqw89</Authors>
|
||||
<Company>qwsdcvghyu</Company>
|
||||
<Version>2.7.0</Version>
|
||||
<Version>2.9.0</Version>
|
||||
<Description>A library for downloading internet resources</Description>
|
||||
<PackageProjectUrl>https://github.com/qwsdcvghyu89/Beam</PackageProjectUrl>
|
||||
<RepositoryUrl>https://github.com/qwsdcvghyu89/Beam</RepositoryUrl>
|
||||
<PackageId>aeqw89.Beam</PackageId>
|
||||
<PackageVersion>2.7.0</PackageVersion>
|
||||
<PackageVersion>2.9.0</PackageVersion>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Beam.Api\Beam.Api.csproj">
|
||||
|
||||
@@ -7,12 +7,12 @@
|
||||
<Title>Beam</Title>
|
||||
<Authors>aeqw89</Authors>
|
||||
<Company>qwsdcvghyu</Company>
|
||||
<Version>2.6.3</Version>
|
||||
<Version>2.8.4</Version>
|
||||
<Description>A library for downloading internet resources</Description>
|
||||
<PackageProjectUrl>https://github.com/qwsdcvghyu89/Beam</PackageProjectUrl>
|
||||
<RepositoryUrl>https://github.com/qwsdcvghyu89/Beam</RepositoryUrl>
|
||||
<PackageId>aeqw89.Beam</PackageId>
|
||||
<PackageVersion>2.6.3</PackageVersion>
|
||||
<PackageVersion>2.8.4</PackageVersion>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Beam.Api\Beam.Api.csproj">
|
||||
|
||||
Reference in New Issue
Block a user