// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

import { Field, Schema } from './schema.js';
import * as dtypes from './type.js';
import { Data, DataProps } from './data.js';
import { BuilderType, JavaScriptDataType } from './interfaces.js';
import { Vector, makeVector } from './vector.js';
import { Builder, BuilderOptions } from './builder.js';
import { instance as getBuilderConstructor } from './visitor/builderctor.js';
import { ArrayDataType, BigIntArray, JavaScriptArrayDataType, TypedArray, TypedArrayDataType } from './interfaces.js';
import { Table } from './table.js';
import { RecordBatch } from './recordbatch.js';
import { compareTypes } from './visitor/typecomparator.js';

/**
 * Creates a {@link Builder} for `options.type`, and recursively creates one
 * child builder for every entry of `options.type.children`.
 *
 * Child options are looked up in `options.children`: by position when it is an
 * array, otherwise by field name. A child without its own entry falls back to
 * an options object that only carries the parent's `nullValues`.
 *
 * @param options The builder options; `options.type` selects the builder constructor.
 * @returns The builder, with its `children` populated for nested types.
 */
export function makeBuilder<T extends dtypes.DataType = any, TNull = any>(options: BuilderOptions<T, TNull>): BuilderType<T, TNull> {

    const type = options.type;
    const builder = new (getBuilderConstructor.getVisitFn<T>(type)())(options) as Builder<T, TNull>;

    if (type.children && type.children.length > 0) {

        const children = options['children'] || [] as BuilderOptions[];
        const defaultOptions = { 'nullValues': options['nullValues'] };
        // `children` may be given positionally (array) or keyed by field name (object).
        const getChildOptions = Array.isArray(children)
            ? ((_: Field, i: number) => children[i] || defaultOptions)
            : (({ name }: Field) => children[name] || defaultOptions);

        for (const [index, field] of type.children.entries()) {
            const { type } = field;
            const opts = getChildOptions(field, index);
            // The child's own field type always overrides any `type` found in its options.
            builder.children.push(makeBuilder({ ...opts, type }));
        }
    }

    return builder as BuilderType<T, TNull>;
}

/**
 * Creates a Vector from a JavaScript array via a {@link Builder}.
 * Use {@link makeVector} if you only want to create a vector from a typed array.
 *
 * When no `type` is supplied it is inferred from the values. `null` is the
 * only value registered as a null sentinel (`nullValues: [null]`).
 *
 * @example
 * ```ts
 * const vf64 = vectorFromArray([1, 2, 3]);
 * const vi8 = vectorFromArray([1, 2, 3], new Int8);
 * const vdict = vectorFromArray(['foo', 'bar']);
 * const vstruct = vectorFromArray([{a: 'foo', b: 42}, {a: 'bar', b: 12}]);
 * ```
 */
export function vectorFromArray(values: readonly (null | undefined)[], type?: dtypes.Null): Vector<dtypes.Null>;
export function vectorFromArray(values: readonly (null | undefined | boolean)[], type?: dtypes.Bool): Vector<dtypes.Bool>;
export function vectorFromArray<T extends dtypes.Utf8 | dtypes.Dictionary<dtypes.Utf8> = dtypes.Dictionary<dtypes.Utf8, dtypes.Int32>>(values: readonly (null | undefined | string)[], type?: T): Vector<T>;
export function vectorFromArray<T extends dtypes.Date_>(values: readonly (null | undefined | Date)[], type?: T): Vector<T>;
export function vectorFromArray<T extends dtypes.Int>(values: readonly (null | undefined | number)[], type: T): Vector<T>;
export function vectorFromArray<T extends dtypes.Int64 | dtypes.Uint64 = dtypes.Int64>(values: readonly (null | undefined | bigint)[], type?: T): Vector<T>;
export function vectorFromArray<T extends dtypes.Float = dtypes.Float64>(values: readonly (null | undefined | number)[], type?: T): Vector<T>;
export function vectorFromArray<T extends dtypes.DataType>(values: readonly (unknown)[], type: T): Vector<T>;
export function vectorFromArray<T extends readonly unknown[]>(values: T): Vector<JavaScriptArrayDataType<T>>;
/** Creates a Vector from a typed array via {@link makeVector}. */
export function vectorFromArray<T extends TypedArray | BigIntArray>(data: T): Vector<TypedArrayDataType<T>>;

export function vectorFromArray<T extends dtypes.DataType>(data: Data<T>): Vector<T>;
export function vectorFromArray<T extends dtypes.DataType>(data: Vector<T>): Vector<T>;
export function vectorFromArray<T extends dtypes.DataType>(data: DataProps<T>): Vector<T>;
export function vectorFromArray<T extends TypedArray | BigIntArray | readonly unknown[]>(data: T): Vector<ArrayDataType<T>>;

export function vectorFromArray(init: any, type?: dtypes.DataType) {
    // Data, Vectors, DataProps-like objects (anything with a DataType `type`)
    // and typed arrays are handed to `makeVector` as-is, bypassing the builder.
    if (init instanceof Data || init instanceof Vector || init.type instanceof dtypes.DataType || ArrayBuffer.isView(init)) {
        return makeVector(init as any);
    }
    const options: IterableBuilderOptions = { type: type ?? inferType(init), nullValues: [null] };
    const chunks = [...builderThroughIterable(options)(init)];
    // The builder iterator may yield several chunks; stitch them into one Vector.
    const vector = chunks.length === 1 ? chunks[0] : chunks.reduce((a, b) => a.concat(b));
    if (dtypes.DataType.isDictionary(vector.type)) {
        return vector.memoize();
    }
    return vector;
}

/**
 * Creates a {@link Table} from an array of objects.
 *
 * The rows are first turned into a struct Vector with {@link vectorFromArray};
 * the struct's child fields become the table schema, and the vector's first
 * data chunk backs the single record batch.
 *
 * @param array A table of objects.
 * @returns A Table wrapping one RecordBatch built from `array`.
 */
export function tableFromJSON<T extends Record<string, unknown>>(array: T[]): Table<{ [P in keyof T]: JavaScriptDataType<T[P]> }> {
    const vector = vectorFromArray(array) as Vector<dtypes.Struct<any>>;
    const batch = new RecordBatch(new Schema(vector.type.children), vector.data[0]);
    return new Table(batch);
}

/**
 * Infers an Arrow DataType from a homogeneous array of JavaScript values.
 *
 * `null` and `undefined` entries are ignored for classification. The mapping is:
 * empty array -> Null, numbers -> Float64, strings -> Dictionary<Utf8, Int32>,
 * bigints -> Int64, booleans -> Bool, Dates -> DateMillisecond,
 * arrays -> List (all non-null elements must infer to the same child type),
 * plain objects -> Struct (one nullable field per key, typed by the first
 * non-null value seen for that key).
 *
 * NOTE: a non-empty array made only of nullish values satisfies the "numbers"
 * check first and is therefore inferred as Float64.
 *
 * @throws {TypeError} When the values are of mixed or unsupported kinds.
 * @ignore
 */
function inferType<T extends readonly unknown[]>(values: T): JavaScriptArrayDataType<T>;
function inferType(value: readonly unknown[]): dtypes.DataType {
    if (value.length === 0) { return new dtypes.Null; }
    let nullsCount = 0;
    let arraysCount = 0;
    let objectsCount = 0;
    let numbersCount = 0;
    let stringsCount = 0;
    let bigintsCount = 0;
    let booleansCount = 0;
    let datesCount = 0;

    for (const val of value) {
        if (val == null) { ++nullsCount; continue; }
        switch (typeof val) {
            case 'bigint': ++bigintsCount; continue;
            case 'boolean': ++booleansCount; continue;
            case 'number': ++numbersCount; continue;
            case 'string': ++stringsCount; continue;
            case 'object':
                if (Array.isArray(val)) {
                    ++arraysCount;
                } else if (Object.prototype.toString.call(val) === '[object Date]') {
                    ++datesCount;
                } else {
                    ++objectsCount;
                }
                continue;
        }
        // Reached only for kinds with no Arrow mapping here (e.g. function, symbol).
        throw new TypeError('Unable to infer Vector type from input values, explicit type declaration expected');
    }

    if (numbersCount + nullsCount === value.length) {
        return new dtypes.Float64;
    } else if (stringsCount + nullsCount === value.length) {
        return new dtypes.Dictionary(new dtypes.Utf8, new dtypes.Int32);
    } else if (bigintsCount + nullsCount === value.length) {
        return new dtypes.Int64;
    } else if (booleansCount + nullsCount === value.length) {
        return new dtypes.Bool;
    } else if (datesCount + nullsCount === value.length) {
        return new dtypes.DateMillisecond;
    } else if (arraysCount + nullsCount === value.length) {
        const array = value as Array<unknown>[];
        // At least one non-null array exists here: an all-null input was
        // already claimed by the numbers branch above.
        const childType = inferType(array[array.findIndex((ary) => ary != null)]);
        if (array.every((ary) => ary == null || compareTypes(childType, inferType(ary)))) {
            return new dtypes.List(new Field('', childType, true));
        }
    } else if (objectsCount + nullsCount === value.length) {
        const fields = new Map<string, Field>();
        for (const row of value as (Record<string, unknown> | null | undefined)[]) {
            // Null/undefined rows are valid (they become null structs) but carry
            // no keys; `Object.keys(null)` would throw, so skip them.
            if (row == null) { continue; }
            for (const key of Object.keys(row)) {
                if (!fields.has(key) && row[key] != null) {
                    // use the type inferred for the first instance of a found key
                    fields.set(key, new Field(key, inferType([row[key]]), true));
                }
            }
        }
        return new dtypes.Struct([...fields.values()]);
    }

    throw new TypeError('Unable to infer Vector type from input values, explicit type declaration expected');
}

/**
 * A set of options to create an Iterable or AsyncIterable `Builder` transform function.
 * @see {@link builderThroughIterable}
 * @see {@link builderThroughAsyncIterable}
 */
export interface IterableBuilderOptions<T extends dtypes.DataType = any, TNull = any> extends BuilderOptions<T, TNull> {
    highWaterMark?: number;
    queueingStrategy?: 'bytes' | 'count';
    dictionaryHashFunction?: (value: any) => string | number;
    valueToChildTypeId?: (builder: Builder<T, TNull>, value: any, offset: number) => number;
}

/** @ignore */
type ThroughIterable<T extends dtypes.DataType = any, TNull = any> = (source: Iterable<T['TValue'] | TNull>) => IterableIterator<Vector<T>>;

/**
 * Transform a synchronous `Iterable` of arbitrary JavaScript values into a
 * sequence of Arrow Vector<T> following the chunking semantics defined in
 * the supplied `options` argument.
 *
 * This function returns a function that accepts an `Iterable` of values to
 * transform. When called, this function returns an Iterator of `Vector<T>`.
 *
 * The resulting `Iterator<Vector<T>>` yields Vectors based on the
 * `queueingStrategy` and `highWaterMark` specified in the `options` argument.
 *
 * * If `queueingStrategy` is `"count"` (or omitted), the `Iterator<Vector<T>>`
 *   will flush the underlying `Builder` (and yield a new `Vector`) once the
 *   Builder's `length` reaches or exceeds the supplied `highWaterMark`
 *   (default: `Number.POSITIVE_INFINITY`, i.e. a single chunk).
 * * If `queueingStrategy` is `"bytes"`, the `Iterator<Vector<T>>` will flush
 *   the underlying `Builder` (and yield a new `Vector`) once its `byteLength`
 *   reaches or exceeds the supplied `highWaterMark` (default: `2 ** 14`).
 *
 * At least one Vector is always yielded, even for an empty source.
 *
 * @param {IterableBuilderOptions<T, TNull>} options An object of properties which determine the `Builder` to create and the chunking semantics to use.
 * @returns A function which accepts a JavaScript `Iterable` of values to
 *          write, and returns an `Iterator` that yields Vectors according
 *          to the chunking semantics defined in the `options` argument.
 * @nocollapse
 */
export function builderThroughIterable<T extends dtypes.DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) {
    const { ['queueingStrategy']: queueingStrategy = 'count' } = options;
    const { ['highWaterMark']: highWaterMark = queueingStrategy !== 'bytes' ? Number.POSITIVE_INFINITY : 2 ** 14 } = options;
    const sizeProperty: 'length' | 'byteLength' = queueingStrategy !== 'bytes' ? 'length' : 'byteLength';
    return function* (source: Iterable<T['TValue'] | TNull>) {
        let numChunks = 0;
        const builder = makeBuilder(options);
        for (const value of source) {
            if (builder.append(value)[sizeProperty] >= highWaterMark) {
                // `++numChunks` is always truthy here; it just counts the flush.
                ++numChunks && (yield builder.toVector());
            }
        }
        // Emit whatever is left, or an empty Vector if nothing was ever flushed.
        if (builder.finish().length > 0 || numChunks === 0) {
            yield builder.toVector();
        }
    } as ThroughIterable<T, TNull>;
}

/** @ignore */
type ThroughAsyncIterable<T extends dtypes.DataType = any, TNull = any> = (source: Iterable<T['TValue'] | TNull> | AsyncIterable<T['TValue'] | TNull>) => AsyncIterableIterator<Vector<T>>;

/**
 * Transform an `AsyncIterable` of arbitrary JavaScript values into a
 * sequence of Arrow Vector<T> following the chunking semantics defined in
 * the supplied `options` argument.
 *
 * This function returns a function that accepts an `AsyncIterable` of values to
 * transform. When called, this function returns an AsyncIterator of `Vector<T>`.
 *
 * The resulting `AsyncIterator<Vector<T>>` yields Vectors based on the
 * `queueingStrategy` and `highWaterMark` specified in the `options` argument.
 *
 * * If `queueingStrategy` is `"count"` (or omitted), the `AsyncIterator<Vector<T>>`
 *   will flush the underlying `Builder` (and yield a new `Vector`) once the
 *   Builder's `length` reaches or exceeds the supplied `highWaterMark`
 *   (default: `Number.POSITIVE_INFINITY`, i.e. a single chunk).
 * * If `queueingStrategy` is `"bytes"`, the `AsyncIterator<Vector<T>>` will flush
 *   the underlying `Builder` (and yield a new `Vector`) once its `byteLength`
 *   reaches or exceeds the supplied `highWaterMark` (default: `2 ** 14`).
 *
 * At least one Vector is always yielded, even for an empty source.
 *
 * @param {IterableBuilderOptions<T, TNull>} options An object of properties which determine the `Builder` to create and the chunking semantics to use.
 * @returns A function which accepts a JavaScript `AsyncIterable` of values
 *          to write, and returns an `AsyncIterator` that yields Vectors
 *          according to the chunking semantics defined in the `options`
 *          argument.
 * @nocollapse
 */
export function builderThroughAsyncIterable<T extends dtypes.DataType = any, TNull = any>(options: IterableBuilderOptions<T, TNull>) {
    const { ['queueingStrategy']: queueingStrategy = 'count' } = options;
    const { ['highWaterMark']: highWaterMark = queueingStrategy !== 'bytes' ? Number.POSITIVE_INFINITY : 2 ** 14 } = options;
    const sizeProperty: 'length' | 'byteLength' = queueingStrategy !== 'bytes' ? 'length' : 'byteLength';
    return async function* (source: Iterable<T['TValue'] | TNull> | AsyncIterable<T['TValue'] | TNull>) {
        let numChunks = 0;
        const builder = makeBuilder(options);
        for await (const value of source) {
            if (builder.append(value)[sizeProperty] >= highWaterMark) {
                // `++numChunks` is always truthy here; it just counts the flush.
                ++numChunks && (yield builder.toVector());
            }
        }
        // Emit whatever is left, or an empty Vector if nothing was ever flushed.
        if (builder.finish().length > 0 || numChunks === 0) {
            yield builder.toVector();
        }
    } as ThroughAsyncIterable<T, TNull>;
}