Documentation
¶
Index ¶
- Constants
- Variables
- func GetLogger() *slog.Logger
- func Is(err error, errorType ErrorType) bool
- func RegisterPipe(name string, fn PipeFunc)
- func Scrape[T any](ctx context.Context, html string, config *Config) ([]T, error)
- func ScrapeURL[T any](ctx context.Context, url string, config *Config) ([]T, error)
- func ScrapeURLUntyped(ctx context.Context, url string, config *Config) ([]map[string]any, error)
- func ScrapeUntyped(ctx context.Context, html string, config *Config) ([]map[string]any, error)
- func SetLogLevel(level slog.Level)
- func SetLogger(logger *slog.Logger)
- func ValidateXPath(html string, xpaths map[string]string) map[string]ValidationResult
- func ValidateXPathURL(url string, config *Config) (map[string]ValidationResult, error)
- func WithURL(ctx context.Context, url string) context.Context
- type Config
- type ConfigFormat
- type EnvMapping
- type ErrorType
- type FieldConfig
- type HealthCheckResult
- type HealthStatus
- type PageResult
- type PaginatedResults
- type PaginationConfig
- type PaginationError
- type PaginationInfo
- type PartialResult
- type PipeError
- type PipeFunc
- type ScrapeError
- type ValidationResult
Constants ¶
const (
	DefaultMaxPages          = 100
	DefaultPaginationTimeout = 10 * time.Minute
)
Default pagination configuration
Variables ¶
var DefaultEnvMapping = &EnvMapping{
Timeout: "GTMLP_TIMEOUT",
UserAgent: "GTMLP_USER_AGENT",
RandomUA: "GTMLP_RANDOM_UA",
MaxRetries: "GTMLP_MAX_RETRIES",
Proxy: "GTMLP_PROXY",
}
DefaultEnvMapping provides the default environment variable names.
Functions ¶
func RegisterPipe ¶
RegisterPipe registers a custom pipe function
func Scrape ¶
Scrape extracts data from HTML using XPath with a typed result. It finds all container nodes and extracts fields from each one. Returns an empty slice if no containers are found.
func ScrapeURLUntyped ¶
ScrapeURLUntyped fetches a URL and scrapes it, returning maps (no type parameter)
func ScrapeUntyped ¶
ScrapeUntyped extracts data from HTML using XPath, returning map slices. It finds all container nodes and extracts fields from each one. Returns an empty slice if no containers are found.
func SetLogLevel ¶
SetLogLevel changes the global log level by creating a new default handler. Available levels: slog.LevelDebug, slog.LevelInfo, slog.LevelWarn, slog.LevelError.
Default: slog.LevelWarn (production-safe)
Note: This recreates the handler with default settings (TextHandler to stderr). If you're using a custom handler (custom writer, JSON format, etc.), use SetLogger instead.
Example:
// Development: enable Info logs
gtmlp.SetLogLevel(slog.LevelInfo)
// Troubleshooting: enable Debug logs
gtmlp.SetLogLevel(slog.LevelDebug)
// Production: use default Warn level (no call needed)
// For custom handlers, use SetLogger:
handler := slog.NewJSONHandler(myWriter, &slog.HandlerOptions{Level: slog.LevelDebug})
gtmlp.SetLogger(slog.New(handler))
func SetLogger ¶
SetLogger configures the global logger. Use this to customize the logger handler (JSON vs. Text, output destination, etc.).
Example:
handler := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
Level: slog.LevelInfo,
})
gtmlp.SetLogger(slog.New(handler))
func ValidateXPath ¶
func ValidateXPath(html string, xpaths map[string]string) map[string]ValidationResult
ValidateXPath validates XPath expressions against HTML
func ValidateXPathURL ¶
func ValidateXPathURL(url string, config *Config) (map[string]ValidationResult, error)
ValidateXPathURL validates XPath expressions from a URL
Types ¶
type Config ¶
type Config struct {
// XPath definitions
Container string // Repeating element selector
AltContainer []string // Alternative container selectors
Fields map[string]FieldConfig // Field name → FieldConfig
// Pagination
Pagination *PaginationConfig // Optional pagination configuration
// Security options
URLValidator func(string) error // Optional custom URL validation function
AllowPrivateIPs bool // Allow scraping private/internal IPs (default: false)
// HTTP options
Timeout time.Duration
UserAgent string
RandomUA bool
MaxRetries int
Proxy string
Headers map[string]string
}
Config holds scraping configuration
func LoadConfig ¶
func LoadConfig(path string, envMapping *EnvMapping) (*Config, error)
LoadConfig loads selector config from file (JSON/YAML auto-detected)
func ParseConfig ¶
func ParseConfig(data string, format ConfigFormat, envMapping *EnvMapping) (*Config, error)
ParseConfig parses config from string
type ConfigFormat ¶
type ConfigFormat string
ConfigFormat specifies file format
const (
	FormatJSON ConfigFormat = "json"
	FormatYAML ConfigFormat = "yaml"
)
type EnvMapping ¶
type EnvMapping struct {
Timeout string
UserAgent string
RandomUA string
MaxRetries string
Proxy string
}
EnvMapping defines configurable environment variable names
type FieldConfig ¶
FieldConfig defines a single field's XPath and optional pipes
type HealthCheckResult ¶
type HealthCheckResult struct {
URL string // The URL that was checked
Status HealthStatus // The health status of the URL
Code int // HTTP status code (0 if error occurred)
Latency time.Duration // Time taken for the health check
Error error // Error message if check failed
}
HealthCheckResult represents the result of a health check
func CheckHealth ¶
func CheckHealth(url string) HealthCheckResult
CheckHealth performs a health check on a single URL
func CheckHealthMulti ¶
func CheckHealthMulti(urls []string) []HealthCheckResult
CheckHealthMulti performs health checks on multiple URLs concurrently
func CheckHealthWithOptions ¶
func CheckHealthWithOptions(url string, config *Config) HealthCheckResult
CheckHealthWithOptions performs a health check on a single URL with custom configuration
type HealthStatus ¶
type HealthStatus int
HealthStatus represents the health status of a URL
const (
	// StatusHealthy indicates the URL returned a 2xx status code
	StatusHealthy HealthStatus = iota
	// StatusUnhealthy indicates the URL returned a 4xx or 5xx status code
	StatusUnhealthy
	// StatusError indicates there was a network or other error
	StatusError
)
func (HealthStatus) String ¶
func (s HealthStatus) String() string
String returns the string representation of HealthStatus
type PageResult ¶
PageResult contains results from a single page
type PaginatedResults ¶
type PaginatedResults[T any] struct {
	Pages      []PageResult[T]
	TotalPages int
	TotalItems int
}
PaginatedResults contains page-separated scraping results
func ScrapeURLWithPages ¶
func ScrapeURLWithPages[T any](ctx context.Context, url string, config *Config) (*PaginatedResults[T], error)
ScrapeURLWithPages fetches a URL and scrapes it with pagination, returning page-separated results
type PaginationConfig ¶
type PaginationConfig struct {
Type string // "next-link" or "numbered"
NextSelector string // XPath for next link (next-link type)
AltSelectors []string // Fallback selectors for next link
PageSelector string // XPath for all page links (numbered type)
Pipes []string // URL transformation pipes
MaxPages int // Maximum pages to scrape (default: 100)
Timeout time.Duration // Total pagination timeout (default: 10m)
}
PaginationConfig defines pagination behavior
type PaginationError ¶
type PaginationError struct {
PageURL string // URL that failed
PageNumber int // Page number (1-indexed)
PartialData any // Items scraped before failure
TotalScraped int // Total items before failure
Cause error // Underlying error
}
PaginationError represents an error during pagination
func (*PaginationError) Error ¶
func (e *PaginationError) Error() string
type PaginationInfo ¶
type PaginationInfo struct {
URLs []string // All discovered page URLs
Type string // "next-link" or "numbered"
BaseURL string // Original base URL
}
PaginationInfo contains extracted pagination URLs
func ExtractPaginationURLs ¶
func ExtractPaginationURLs(ctx context.Context, url string, config *Config) (*PaginationInfo, error)
ExtractPaginationURLs extracts all pagination URLs without scraping
type PartialResult ¶
PartialResult contains data and field-level errors
type ScrapeError ¶
ScrapeError is a typed error with context
func (*ScrapeError) Error ¶
func (e *ScrapeError) Error() string
func (*ScrapeError) Unwrap ¶
func (e *ScrapeError) Unwrap() error
Source Files
¶
Directories
¶
| Path | Synopsis |
|---|---|
| examples | |
| v2/basic_json (command) | |
| v2/basic_yaml (command) | |
| v2/ecommerce_json (command) | |
| v2/ecommerce_yaml (command) | |
| v2/embed_json (command) | |
| v2/embed_yaml (command) | |
| v2/pagination_next_json (command) | |
| v2/pagination_numbered_yaml (command) | |
| v2/table_json (command) | |
| v2/table_yaml (command) | |