1- import { reverse } from "d3-array" ;
1+ import { greatest , reverse } from "d3-array" ;
22import { FileAttachment } from "./fileAttachment.js" ;
33import { isArqueroTable } from "./arquero.js" ;
44import { isArrowTable , loadArrow } from "./arrow.js" ;
@@ -66,13 +66,20 @@ function objectHasEnumerableKeys(value) {
6666}
6767
// Returns true if the given value is an array in which every element passes
// isColumnSchema (an empty array therefore qualifies).
function isQueryResultSetSchema(schemas) {
  if (!Array.isArray(schemas)) return false;
  return schemas.every((schema) => isColumnSchema(schema));
}
7174
// Returns true if the given value is an array whose elements are all strings
// (an empty array therefore qualifies).
function isQueryResultSetColumns(columns) {
  if (!Array.isArray(columns)) return false;
  const isString = (name) => typeof name === "string";
  return columns.every(isString);
}
7578
// Returns true if the given value looks like a column schema entry, i.e. it
// has a string `name` and a string `type`. Always returns a strict boolean:
// falsy inputs (null, undefined, 0, "") yield false rather than being
// returned as-is.
function isColumnSchema(schema) {
  return (
    Boolean(schema) &&
    typeof schema.name === "string" &&
    typeof schema.type === "string"
  );
}
82+
7683// Returns true if the value represents an array of primitives (i.e., a
7784// single-column table). This should only be passed values for which
7885// isDataArray returns true.
@@ -191,15 +198,17 @@ function sourceCache(loadSource) {
// Resolves a table data source into something the table cell can operate on.
// The loader is wrapped in sourceCache (presumably so that each source is only
// loaded once; see sourceCache for the exact caching contract).
//
// - Arrow and Arquero tables are handed to loadDuckDBClient.
// - Data arrays of primitives are wrapped into {value} row objects.
// - File attachments are loaded according to their MIME type; .arrow and
//   .parquet files (matched by name) are handed to loadDuckDBClient; any
//   other file type is rejected with an error.
// - Anything else is returned unchanged.
const loadTableDataSource = sourceCache(async (source, name) => {
  if (!(source instanceof FileAttachment)) {
    if (isArrowTable(source) || isArqueroTable(source)) {
      return loadDuckDBClient(source, name);
    }
    if (isDataArray(source) && arrayIsPrimitive(source)) {
      return Array.from(source, (value) => ({value}));
    }
    return source;
  }
  const {mimeType} = source;
  if (mimeType === "text/csv") return source.csv();
  if (mimeType === "text/tab-separated-values") return source.tsv();
  if (mimeType === "application/json") return source.json();
  if (mimeType === "application/x-sqlite3") return source.sqlite();
  // No MIME type matched; fall back to the file extension.
  if (/\.(arrow|parquet)$/i.test(source.name)) {
    return loadDuckDBClient(source, name);
  }
  throw new Error(`unsupported file type: ${mimeType}`);
});
205214
@@ -542,15 +551,84 @@ export function getTypeValidator(colType) {
542551 }
543552}
544553
// Accepts dates in the form of ISOString and LocaleDateString, with or without time
const DATE_TEST = /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/;

// Coerces a single value to the given column type and returns the result.
//
// - "string": strings and null/undefined pass through; anything else is
//   converted with String.
// - "boolean": the strings "true"/"false" (trimmed, case-insensitive) map to
//   true/false and any other string to null; booleans and null/undefined pass
//   through; anything else is converted with Boolean.
// - "bigint": bigints and null/undefined pass through; values that are
//   numerically integers are converted to a BigInt; anything else yields
//   undefined.
// - "integer"/"number": numbers pass through; null/undefined and blank
//   strings yield NaN; anything else is converted with Number.
// - "date": Dates and null/undefined pass through; numbers are passed to the
//   Date constructor; blank strings yield null; other values are stringified
//   and parsed only if they match DATE_TEST, otherwise an invalid Date is
//   returned.
// - "array"/"object"/"buffer"/"other": the value is returned unchanged.
//
// Throws an Error for any other type.
export function coerceToType(value, type) {
  switch (type) {
    case "string":
      return typeof value === "string" || value == null ? value : String(value);
    case "boolean":
      if (typeof value === "string") {
        const trimValue = value.trim().toLowerCase();
        return trimValue === "true"
          ? true
          : trimValue === "false"
          ? false
          : null;
      }
      return typeof value === "boolean" || value == null
        ? value
        : Boolean(value);
    case "bigint": {
      if (typeof value === "bigint" || value == null) return value;
      // A blank string would coerce to 0, so treat it as non-numeric.
      const numeric = typeof value === "string" && !value.trim() ? NaN : +value;
      if (!Number.isInteger(numeric)) return undefined;
      try {
        // Prefer converting the original value: for long digit strings this
        // keeps precision that the intermediate number would lose.
        return BigInt(value); // eslint-disable-line no-undef
      } catch (error) {
        if (!(error instanceof SyntaxError)) throw error;
        // Strings such as "1.0" or "1e3" are numerically integers but are not
        // valid BigInt literals; convert the validated number instead.
        return BigInt(numeric); // eslint-disable-line no-undef
      }
    }
    case "integer": // not a target type for coercion, but can be inferred
    case "number": {
      return typeof value === "number"
        ? value
        : value == null || (typeof value === "string" && !value.trim())
        ? NaN
        : Number(value);
    }
    case "date": {
      if (value instanceof Date || value == null) return value;
      if (typeof value === "number") return new Date(value);
      const trimValue = String(value).trim();
      if (typeof value === "string" && !trimValue) return null;
      // Only hand strings in a recognized format to the Date parser; anything
      // else becomes an invalid Date.
      return new Date(DATE_TEST.test(trimValue) ? trimValue : NaN);
    }
    case "array":
    case "object":
    case "buffer":
    case "other":
      return value;
    default:
      throw new Error(`Unable to coerce to type: ${type}`);
  }
}
603+
545604// This function applies table cell operations to an in-memory table (array of
546605// objects); it should be equivalent to the corresponding SQL query. TODO Use
547606// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
548607// function to do table operations on in-memory data?
549608export function __table ( source , operations ) {
550609 const input = source ;
551610 let { schema, columns} = source ;
552- let primitive = arrayIsPrimitive ( source ) ;
553- if ( primitive ) source = Array . from ( source , ( value ) => ( { value} ) ) ;
611+ let inferredSchema = false ;
612+ if ( ! isQueryResultSetSchema ( schema ) ) {
613+ schema = inferSchema ( source , columns ) ;
614+ inferredSchema = true ;
615+ }
616+ // Combine column types from schema with user-selected types in operations
617+ const types = new Map ( schema . map ( ( { name, type} ) => [ name , type ] ) ) ;
618+ if ( operations . type ) {
619+ for ( const { name, type} of operations . type ) {
620+ types . set ( name , type ) ;
621+ // update schema with user-selected type
622+ if ( schema === input . schema ) schema = schema . slice ( ) ; // copy on write
623+ const colIndex = schema . findIndex ( ( col ) => col . name === name ) ;
624+ if ( colIndex > - 1 ) schema [ colIndex ] = { ...schema [ colIndex ] , type} ;
625+ }
626+ source = source . map ( d => coerceRow ( d , types , schema ) ) ;
627+ } else if ( inferredSchema ) {
628+ // Coerce data according to new schema, unless that happened due to
629+ // operations.type, above.
630+ source = source . map ( d => coerceRow ( d , types , schema ) ) ;
631+ }
554632 for ( const { type, operands} of operations . filter ) {
555633 const [ { value : column } ] = operands ;
556634 const values = operands . slice ( 1 ) . map ( ( { value} ) => value ) ;
@@ -663,7 +741,7 @@ export function __table(source, operations) {
663741 Object . fromEntries ( operations . select . columns . map ( ( c ) => [ c , d [ c ] ] ) )
664742 ) ;
665743 }
666- if ( ! primitive && operations . names ) {
744+ if ( operations . names ) {
667745 const overridesByName = new Map ( operations . names . map ( ( n ) => [ n . column , n ] ) ) ;
668746 if ( schema ) {
669747 schema = schema . map ( ( s ) => {
@@ -684,10 +762,120 @@ export function __table(source, operations) {
684762 } ) )
685763 ) ;
686764 }
687- if ( primitive ) source = source . map ( ( d ) => d . value ) ;
688765 if ( source !== input ) {
689766 if ( schema ) source . schema = schema ;
690767 if ( columns ) source . columns = columns ;
691768 }
692769 return source ;
693770}
771+
// Builds a new row object containing exactly the columns listed in schema.
// Each cell is read from the given object and coerced to the type registered
// for that column in the types map; columns typed "raw" are copied as-is.
function coerceRow(object, types, schema) {
  const coerced = {};
  for (const {name} of schema) {
    const type = types.get(name);
    const cell = object[name];
    if (type === "raw") {
      coerced[name] = cell;
    } else {
      coerced[name] = coerceToType(cell, type);
    }
  }
  return coerced;
}
781+
// Returns a fresh tally object with one zeroed counter per tracked type, plus
// a "defined" counter.
function createTypeCount() {
  const counters = [
    "boolean",
    "integer",
    "number",
    "date",
    "string",
    "array",
    "object",
    "bigint",
    "buffer",
    "defined"
  ];
  return Object.fromEntries(counters.map((counter) => [counter, 0]));
}
796+
// Caution: the order below matters! 🌶️ The first one that passes the ≥90% test
// should be the one that we choose, and therefore these types should be listed
// from most specific to least specific.
const types = [
  "boolean",
  "integer",
  "number",
  "date",
  "bigint",
  "array",
  "object",
  "buffer"
  // Note: "other" and "string" are intentionally omitted; see below!
];
811+
// Collects the union of own, enumerable property names across all rows, in
// order of first appearance. Falsy rows (e.g., null or undefined) are skipped
// rather than causing a crash.
function getAllKeys(rows) {
  const seen = new Set();
  for (const row of rows) {
    if (!row) continue;
    for (const key of Object.keys(row)) seen.add(key);
  }
  return [...seen];
}
830+
// Infers a schema (an array of {name, type, inferred} objects, one per column)
// from the first 100 rows of the given source. If columns is not specified,
// it defaults to every key present in the source rows.
//
// For each column, the non-nullish, non-blank values in the sample are
// tallied by type. Strings are additionally tallied as boolean, number
// (and integer), or date when their trimmed text looks like one.
export function inferSchema(source, columns = getAllKeys(source)) {
  const schema = [];
  const sampleSize = 100;
  const sample = source.slice(0, sampleSize);
  for (const col of columns) {
    const colCount = createTypeCount();
    for (const d of sample) {
      // Rows may themselves be null or undefined; treat them like rows with a
      // missing value rather than throwing on property access.
      let value = d?.[col];
      if (value == null) continue;
      const type = typeof value;
      if (type !== "string") {
        ++colCount.defined;
        if (Array.isArray(value)) ++colCount.array;
        else if (value instanceof Date) ++colCount.date;
        else if (value instanceof ArrayBuffer) ++colCount.buffer;
        else if (type === "number") {
          ++colCount.number;
          if (Number.isInteger(value)) ++colCount.integer;
        }
        // bigint, boolean, or object
        else if (type in colCount) ++colCount[type];
      } else {
        value = value.trim();
        if (!value) continue;
        ++colCount.defined;
        ++colCount.string;
        if (/^(true|false)$/i.test(value)) {
          ++colCount.boolean;
        } else if (value && !isNaN(value)) {
          // The coercing global isNaN is deliberate: this asks whether the
          // string parses as a number.
          ++colCount.number;
          if (Number.isInteger(+value)) ++colCount.integer;
        } else if (DATE_TEST.test(value)) ++colCount.date;
      }
    }
    // Choose the non-string, non-other type with the greatest count that is also
    // ≥90%; or if no such type meets that criterion, fallback to string if
    // ≥90%; and lastly fallback to other.
    const minCount = Math.max(1, colCount.defined * 0.9);
    const type =
      greatest(types, (type) =>
        colCount[type] >= minCount ? colCount[type] : NaN
      ) ?? (colCount.string >= minCount ? "string" : "other");
    schema.push({
      name: col,
      type,
      inferred: type
    });
  }
  return schema;
}
0 commit comments