Refactor downloaders to use ByteDocument and add options builders
Replaces generic RawType with ByteDocument in downloaders and context classes, simplifying type usage. Adds builder classes for FailurePredicateOptions, FragmentOptions, SkipPredicateOptions, and UnitDownloaderOptions to improve configuration flexibility. Introduces DownloadTarget enum and SkipPredicate delegate for more granular download control. Refactors Fluent API interfaces and implementations to remove RawType generics and streamline usage. Adds Playwright and Stealth download strategies for extensibility.
This commit is contained in:
@@ -9,7 +9,7 @@ using Beam.Downloaders;
|
||||
using Beam.Models;
|
||||
|
||||
namespace Beam.Stealth {
|
||||
public class StealthFragmentDownloader<RawType, OutType> : UnitFragmentDownloader<RawType, OutType> where RawType : IDocument {
|
||||
public StealthFragmentDownloader(UnitDownloaderOptions<RawType, OutType> options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options, new StealthUnitDownloader<RawType, OutType>(options, config, manipulator)) {}
|
||||
/// <summary>
/// Fragment downloader whose unit work is delegated to a <c>StealthUnitDownloader&lt;OutType&gt;</c>.
/// </summary>
public class StealthFragmentDownloader<OutType> : UnitFragmentDownloader<OutType> {
    /// <summary>
    /// Builds the downloader by handing the base class the options together with a new
    /// <c>StealthUnitDownloader&lt;OutType&gt;</c> created from the same options, config and manipulator.
    /// </summary>
    /// <param name="options">Unit downloader options, passed to both the base class and the unit downloader.</param>
    /// <param name="config">Stealth configuration forwarded to the unit downloader.</param>
    /// <param name="manipulator">Manipulator delegate forwarded to the unit downloader.</param>
    public StealthFragmentDownloader(
        UnitDownloaderOptions<OutType> options,
        StealthConfig config,
        StealthAsyncManipulator manipulator)
        : base(options, new StealthUnitDownloader<OutType>(options, config, manipulator)) {
    }
}
|
||||
}
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
using HtmlAgilityPack;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Beam.Abstractions;
|
||||
using Beam.Downloaders;
|
||||
using Beam.Models;
|
||||
|
||||
namespace Beam.Stealth {
|
||||
/// <summary>
/// Fragment downloader whose unit work is delegated to a
/// <c>StealthUnitPageDownloader&lt;RawType, OutType&gt;</c>.
/// </summary>
public class StealthFragmentPageDownloader<RawType, OutType> : UnitFragmentDownloader<RawType, OutType> where RawType : IDocument {
    /// <summary>
    /// Builds the downloader by handing the base class the options together with a new
    /// <c>StealthUnitPageDownloader&lt;RawType, OutType&gt;</c> created from the same arguments.
    /// </summary>
    /// <param name="options">Unit downloader options, passed to both the base class and the unit downloader.</param>
    /// <param name="config">Stealth configuration forwarded to the unit downloader.</param>
    /// <param name="manipulator">Manipulator delegate forwarded to the unit downloader.</param>
    public StealthFragmentPageDownloader(
        UnitDownloaderOptions<RawType, OutType> options,
        StealthConfig config,
        StealthAsyncManipulator manipulator)
        : base(options, new StealthUnitPageDownloader<RawType, OutType>(options, config, manipulator)) {
    }
}
|
||||
}
|
||||
@@ -9,18 +9,27 @@ using System.Threading.Tasks;
|
||||
using Beam.Abstractions;
|
||||
using Beam.Downloaders;
|
||||
using Beam.Models;
|
||||
using Beam.Stealth.Strategies;
|
||||
|
||||
namespace Beam.Stealth {
|
||||
using File = System.IO.File;
|
||||
|
||||
public class StealthUnitDownloader<RawType, OutType> : UnitDownloader<RawType, OutType> where RawType : IDocument {
|
||||
public class StealthUnitDownloader<OutType> : UnitDownloader<OutType> {
|
||||
public StealthConfig Config { get; }
|
||||
public StealthAsyncManipulator Manipulator { get; }
|
||||
private ILogger? Logger => Config.Logger;
|
||||
|
||||
public StealthUnitDownloader(UnitDownloaderOptions<RawType, OutType> options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) {
|
||||
private IDownloadStrategy _downloadStrategy { get; }
|
||||
|
||||
/// <summary>
/// Stores the stealth configuration and manipulator, then selects the download strategy
/// that matches <c>options.Target</c>.
/// </summary>
/// <param name="options">Unit downloader options; passed to the base class and read for its <c>Target</c>.</param>
/// <param name="config">Stealth configuration exposed through <c>Config</c>.</param>
/// <param name="manipulator">Manipulator delegate exposed through <c>Manipulator</c>.</param>
/// <exception cref="NotSupportedException">
/// <c>options.Target</c> is not <c>URL</c>, <c>InURL</c> or <c>Complex</c>.
/// </exception>
public StealthUnitDownloader(UnitDownloaderOptions<OutType> options, StealthConfig config, StealthAsyncManipulator manipulator) : base(options) {
    Config = config;
    Manipulator = manipulator;

    // URL-style targets use the page strategy; Complex targets use the waiting strategy.
    _downloadStrategy = options.Target switch {
        DownloadTarget.URL or DownloadTarget.InURL => new PageDownloadStrategy(),
        DownloadTarget.Complex => new WaitingDownloadStrategy(),
        // Name the rejected value so a misconfigured target can be diagnosed from the exception alone.
        _ => throw new NotSupportedException(
            $"Download target '{options.Target}' is not supported by {nameof(StealthUnitDownloader<OutType>)}.")
    };
}
|
||||
|
||||
protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream,
|
||||
@@ -29,76 +38,7 @@ namespace Beam.Stealth {
|
||||
await driver.Navigate().GoToUrlAsync(url);
|
||||
await Manipulator(driver);
|
||||
|
||||
await using var stream = await WaitForDownloadAsync(url, progress, Stopwatch.StartNew(), ct);
|
||||
await (stream?.CopyToAsync(destinationStream, ct) ?? Task.CompletedTask);
|
||||
}
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
private async Task<Stream?> WaitForDownloadAsync(
|
||||
string link, IProgress<IDownloadReport> progress, Stopwatch sw, CancellationToken ct) {
|
||||
const int PollDelayMs = 250; // how often we look
|
||||
const int StableDelayMs = 1000; // size-unchanged window
|
||||
|
||||
string dir = Config.DownloadsDirectory;
|
||||
string? finalPath = null;
|
||||
long lastSize = -1;
|
||||
DateTime lastChange = DateTime.UtcNow;
|
||||
|
||||
bool IsTemp(string p) =>
|
||||
p.EndsWith(".crdownload", StringComparison.OrdinalIgnoreCase) ||
|
||||
p.EndsWith(".part", StringComparison.OrdinalIgnoreCase);
|
||||
|
||||
Logger?.LogDebug("Polling {Dir} for download files", dir);
|
||||
|
||||
while (sw.Elapsed < Config.TimeOut && !ct.IsCancellationRequested) {
|
||||
// current files in the directory
|
||||
var files = Directory.EnumerateFiles(dir, "*", SearchOption.TopDirectoryOnly).ToArray();
|
||||
|
||||
// ignore temp names; pick (or re-pick) the first real candidate
|
||||
finalPath ??= files.FirstOrDefault(f => !IsTemp(f));
|
||||
|
||||
// still nothing but temps – keep waiting
|
||||
if (finalPath is null) {
|
||||
await Task.Delay(PollDelayMs, ct);
|
||||
continue;
|
||||
}
|
||||
|
||||
// track growth
|
||||
long size = new FileInfo(finalPath).Length;
|
||||
if (size == 0 || size != lastSize) {
|
||||
progress?.Report(new DownloadReport() {
|
||||
BytesDownloaded = size - lastSize,
|
||||
});
|
||||
lastSize = size;
|
||||
lastChange = DateTime.UtcNow;
|
||||
await Task.Delay(PollDelayMs, ct);
|
||||
continue;
|
||||
}
|
||||
|
||||
// size stable long enough *and* no temp files left?
|
||||
bool tempsRemain = files.Any(IsTemp);
|
||||
if ((DateTime.UtcNow - lastChange).TotalMilliseconds < StableDelayMs || tempsRemain) {
|
||||
await Task.Delay(PollDelayMs, ct);
|
||||
continue;
|
||||
}
|
||||
|
||||
// wait until writer releases lock
|
||||
while (true) {
|
||||
try {
|
||||
using FileStream _ =
|
||||
File.Open(finalPath, FileMode.Open, FileAccess.Read, FileShare.None);
|
||||
break;
|
||||
} catch (IOException) {
|
||||
await Task.Delay(200, ct);
|
||||
}
|
||||
}
|
||||
|
||||
return File.OpenRead(finalPath);
|
||||
}
|
||||
|
||||
Logger?.LogWarning("Download timed out after {Elapsed}", sw.Elapsed);
|
||||
return null;
|
||||
await _downloadStrategy.DownloadToStream(url, bufferSize, destinationStream, progress, Config, Logger, ct);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
using HtmlAgilityPack;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Beam.Abstractions;
|
||||
using Beam.Downloaders;
|
||||
using Beam.Models;
|
||||
|
||||
namespace Beam.Stealth {
|
||||
/// <summary>
/// Unit downloader that navigates the configured driver to a URL, runs the manipulator
/// against it, and writes the driver's page source to the destination stream as UTF-8.
/// </summary>
public class StealthUnitPageDownloader<RawType, OutType> : UnitDownloader<RawType, OutType> where RawType : IDocument {
    /// <summary>Stealth configuration; supplies the driver and the logger.</summary>
    public StealthConfig Config { get; }

    /// <summary>Delegate awaited with the driver after navigation and before the page source is read.</summary>
    public StealthAsyncManipulator Manipulator { get; }

    /// <summary>Logger taken from <see cref="Config"/>; may be null.</summary>
    private ILogger? Logger {
        get { return Config.Logger; }
    }

    /// <summary>
    /// Passes <paramref name="options"/> to the base class and stores the config and manipulator.
    /// </summary>
    public StealthUnitPageDownloader(
        UnitDownloaderOptions<RawType, OutType> options,
        StealthConfig config,
        StealthAsyncManipulator manipulator)
        : base(options) {
        (Config, Manipulator) = (config, manipulator);
    }

    /// <summary>
    /// Loads <paramref name="url"/> in the driver, awaits the manipulator, then writes the
    /// UTF-8 encoded page source to <paramref name="destinationStream"/>.
    /// </summary>
    /// <remarks>
    /// <paramref name="bufferSize"/> and <paramref name="progress"/> are not used by this override.
    /// </remarks>
    protected override async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress<IDownloadReport> progress, CancellationToken ct) {
        var browser = Config.Driver;

        await browser.Navigate().GoToUrlAsync(url);
        await Manipulator(browser);

        string pageSource = browser.PageSource;
        byte[] payload = Encoding.UTF8.GetBytes(pageSource);
        await destinationStream.WriteAsync(payload, ct);
    }
}
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
using Beam.Abstractions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace Beam.Stealth.Strategies;
|
||||
|
||||
/// <summary>
/// Contract for a strategy that delivers downloaded content into a destination stream.
/// </summary>
internal interface IDownloadStrategy {
    /// <summary>
    /// Writes the content obtained for <paramref name="url"/> to <paramref name="destinationStream"/>.
    /// </summary>
    /// <param name="url">Address of the resource being downloaded.</param>
    /// <param name="bufferSize">Requested buffer size; how it is used is up to the implementation.</param>
    /// <param name="destinationStream">Stream that receives the content.</param>
    /// <param name="progress">Receiver for download progress reports.</param>
    /// <param name="config">Stealth configuration available to the strategy.</param>
    /// <param name="logger">Optional logger; may be null.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    Task DownloadToStream(
        string url,
        int bufferSize,
        Stream destinationStream,
        IProgress<IDownloadReport> progress,
        StealthConfig config,
        ILogger? logger,
        CancellationToken ct);
}
|
||||
@@ -0,0 +1,13 @@
|
||||
using System.Text;
|
||||
using Beam.Abstractions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace Beam.Stealth.Strategies;
|
||||
|
||||
/// <summary>
/// Strategy that writes the driver's current page source to the destination stream.
/// </summary>
internal class PageDownloadStrategy : IDownloadStrategy {
    /// <summary>
    /// Encodes <c>config.Driver.PageSource</c> as UTF-8 and writes it to
    /// <paramref name="destinationStream"/>.
    /// </summary>
    /// <remarks>
    /// This method does not navigate; it reads whatever page the driver currently holds.
    /// <paramref name="url"/>, <paramref name="bufferSize"/>, <paramref name="progress"/> and
    /// <paramref name="logger"/> are not used here.
    /// </remarks>
    public async Task DownloadToStream(
        string url,
        int bufferSize,
        Stream destinationStream,
        IProgress<IDownloadReport> progress,
        StealthConfig config,
        ILogger? logger,
        CancellationToken ct) {
        string pageSource = config.Driver.PageSource;
        byte[] payload = Encoding.UTF8.GetBytes(pageSource);

        await destinationStream.WriteAsync(payload, ct);
    }
}
|
||||
@@ -0,0 +1,83 @@
|
||||
using System.Diagnostics;
|
||||
using Beam.Abstractions;
|
||||
using Beam.Models;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using File = System.IO.File;
|
||||
|
||||
namespace Beam.Stealth.Strategies;
|
||||
|
||||
/// <summary>
/// Strategy that polls <c>config.DownloadsDirectory</c> until a finished download file is
/// available, then copies that file into the destination stream.
/// </summary>
public class WaitingDownloadStrategy : IDownloadStrategy {
    private const int PollDelayMs = 250;      // how often the directory is inspected
    private const int StableDelayMs = 1000;   // how long the size must stay unchanged
    private const int LockRetryDelayMs = 200; // pause between exclusive-open attempts

    /// <summary>
    /// Waits for a download file and copies it to <paramref name="destinationStream"/>.
    /// When the wait ends without a file, nothing is written.
    /// </summary>
    /// <remarks><paramref name="bufferSize"/> is not used by this strategy.</remarks>
    public async Task DownloadToStream(string url, int bufferSize, Stream destinationStream, IProgress<IDownloadReport> progress, StealthConfig config,
        ILogger? logger, CancellationToken ct) {
        await using var stream = await WaitForDownloadAsync(url, progress, Stopwatch.StartNew(), config, logger, ct);

        // A null stream means no file became available; the warning was already logged.
        if (stream is not null) {
            await stream.CopyToAsync(destinationStream, ct);
        }
    }

    /// <summary>True for file names carrying an in-progress download extension.</summary>
    private static bool IsTemp(string p) =>
        p.EndsWith(".crdownload", StringComparison.OrdinalIgnoreCase) ||
        p.EndsWith(".part", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Polls the downloads directory until a non-temporary file has kept the same non-zero
    /// size for <see cref="StableDelayMs"/>, no temporary files remain, and the file can be
    /// opened exclusively.
    /// </summary>
    /// <param name="link">URL being downloaded; not used by the polling logic.</param>
    /// <param name="progress">Receives the number of bytes added since the previous report.</param>
    /// <param name="sw">Running stopwatch compared against <c>config.TimeOut</c>.</param>
    /// <param name="config">Supplies the downloads directory and the timeout.</param>
    /// <param name="logger">Optional logger.</param>
    /// <param name="ct">Cancels the waits between polls.</param>
    /// <returns>A read stream over the downloaded file, or null if none became available in time.</returns>
    private async Task<Stream?> WaitForDownloadAsync(
        string link, IProgress<IDownloadReport> progress, Stopwatch sw, StealthConfig config, ILogger? logger, CancellationToken ct) {
        string dir = config.DownloadsDirectory;
        string? finalPath = null;
        long lastSize = -1; // sentinel: no size observed yet for the current candidate
        DateTime lastChange = DateTime.UtcNow;

        logger?.LogDebug("Polling {Dir} for download files", dir);

        while (sw.Elapsed < config.TimeOut && !ct.IsCancellationRequested) {
            // current files in the directory
            var files = Directory.EnumerateFiles(dir, "*", SearchOption.TopDirectoryOnly).ToArray();

            // ignore temp names; pick (or re-pick) the first real candidate
            finalPath ??= files.FirstOrDefault(f => !IsTemp(f));

            // still nothing but temps - keep waiting
            if (finalPath is null) {
                await Task.Delay(PollDelayMs, ct);
                continue;
            }

            // The candidate may have been renamed or removed since it was picked; reading
            // Length would throw, so drop it and choose again on the next poll.
            var info = new FileInfo(finalPath);
            if (!info.Exists) {
                finalPath = null;
                lastSize = -1;
                await Task.Delay(PollDelayMs, ct);
                continue;
            }

            // track growth
            long size = info.Length;
            if (size == 0 || size != lastSize) {
                progress?.Report(new DownloadReport() {
                    // Measure from 0 on the first observation so the -1 sentinel does not
                    // add a phantom byte to the reported amount.
                    BytesDownloaded = size - Math.Max(lastSize, 0L),
                });
                lastSize = size;
                lastChange = DateTime.UtcNow;
                await Task.Delay(PollDelayMs, ct);
                continue;
            }

            // size stable long enough *and* no temp files left?
            bool tempsRemain = files.Any(IsTemp);
            if ((DateTime.UtcNow - lastChange).TotalMilliseconds < StableDelayMs || tempsRemain) {
                await Task.Delay(PollDelayMs, ct);
                continue;
            }

            // wait until writer releases lock, but never longer than the overall timeout
            if (await WaitForExclusiveAccessAsync(finalPath, sw, config, ct)) {
                return File.OpenRead(finalPath);
            }

            break;
        }

        logger?.LogWarning("Download timed out after {Elapsed}", sw.Elapsed);
        return null;
    }

    /// <summary>
    /// Repeatedly tries to open <paramref name="path"/> with no sharing until it succeeds or
    /// the elapsed time reaches <c>config.TimeOut</c>. At least one attempt is always made.
    /// </summary>
    /// <returns>True once the file could be opened exclusively; false if the timeout was reached first.</returns>
    private static async Task<bool> WaitForExclusiveAccessAsync(string path, Stopwatch sw, StealthConfig config, CancellationToken ct) {
        while (true) {
            try {
                using FileStream _ =
                    File.Open(path, FileMode.Open, FileAccess.Read, FileShare.None);
                return true;
            } catch (IOException) {
                if (!(sw.Elapsed < config.TimeOut)) {
                    return false;
                }

                await Task.Delay(LockRetryDelayMs, ct);
            }
        }
    }
}
|
||||
Reference in New Issue
Block a user