/**
 * Extracts an array of structured data from a web page in an optimized way.
 *
 * AI is used for the first n extractions, until multiple examples have been
 * collected; reliable selectors are then built in the background to make the
 * process more efficient.
 *
 * @deprecated This function is deprecated and will be removed in the future.
 *
 * @param page - The Playwright Page object from which to extract the data.
 * @param options - Settings for the extraction.
 * @param options.label - A label for this extraction process, used for billing
 *   and monitoring.
 * @param options.itemEntityName - The name of the entity items being
 *   extracted. It must be between 1 and 50 characters long and can only
 *   contain letters, digits, periods, underscores, and hyphens.
 * @param options.itemEntitySchema - The schema of the entity items being
 *   extracted.
 * @param options.strategy - Optional. The strategy to use for extraction. If
 *   not provided, the HTML strategy with Claude Haiku is used.
 * @param options.prompt - Optional. A prompt to guide the extraction process.
 * @param options.optionalPropertiesInvalidator - Optional. A function to
 *   invalidate optional properties; it receives the extracted records and
 *   returns an array of strings.
 * @param options.variantKey - Optional. A variant key for the extraction
 *   process. Use this when the page has multiple variants/shapes.
 * @param options.apiKey - Optional. An API key to use for the AI extraction.
 *   Extractions made with your API key will not be billed to your account.
 * @returns A promise that resolves to a list of extracted data.
 */
export declare function extractArrayFromPage(
  page: Page,
  options: {
    label: string;
    itemEntityName: string;
    itemEntitySchema: SimpleArrayItemSchema;
    strategy?: ImageStrategy | HtmlStrategy;
    prompt?: string;
    optionalPropertiesInvalidator?: (result: Record<string, string>[]) => string[];
    variantKey?: string;
    apiKey?: string;
  }
): Promise<Record<string, string>[]>;
Deprecated: This function is deprecated and will be removed in the future.
Extracts an array of structured data from a web page in an optimized way. This function uses AI for the first n extractions, until it has collected multiple examples;
it then builds reliable selectors in the background to make the process more efficient.
Examples
import { extractArrayFromPage } from "@intuned/sdk/optimized-extractors";

// Navigate to the listing page before running the extraction.
await page.goto("https://books.toscrape.com/");

// Extract every "book" item on the page using the HTML strategy.
const books = await extractArrayFromPage(page, {
  strategy: {
    model: "gpt4-turbo",
    type: "HTML",
  },
  itemEntityName: "book",
  label: "books-extraction",
  // Each item is an object with a single required string property, `name`.
  itemEntitySchema: {
    type: "object",
    required: ["name"],
    properties: {
      name: {
        type: "string",
        description: "book name",
        primary: true,
      },
    },
  },
});

console.log(books);
// output:
// [
//   ...
//   { name: 'Olio' },
//   { name: 'Mesaerion: The Best Science Fiction Stories 1800-1849' },
//   { name: 'Libertarianism for Beginners' },
//   { name: "It's Only the Himalayas" }
//   ...
// ]
Arguments
page
The Playwright Page object from which to extract the data.
options.label
A label for this extraction process, used for billing and monitoring.
options.itemEntityName
The name of the entity items being extracted. It must be between 1 and 50 characters long and can only contain letters, digits, periods, underscores, and hyphens.
options.itemEntitySchema
The schema of the entity items being extracted.
options.strategy
Optional. The strategy to use for extraction. If not provided, the HTML strategy with Claude Haiku is used.
options.prompt
Optional. A prompt to guide the extraction process.
options.optionalPropertiesInvalidator
Optional. A function to invalidate optional properties.
options.variantKey
Optional. A variant key for the extraction process. Use this when the page has multiple variants/shapes.
options.apiKey
Optional. An API key to use for the AI extraction. Extractions made with your API key will not be billed to your account.
Returns: Promise<Record<string, string>[]>
A promise that resolves to a list of extracted data, one record of string values per extracted item.