From fe25bf61e5f9956fed4dd6dd302b79a69d5ba75b Mon Sep 17 00:00:00 2001 From: "Harish S. Kulkarni" Date: Fri, 21 Jun 2019 01:56:57 -0700 Subject: [PATCH 1/6] Fixed build errors resulting from upgrade to VS2019 compilers --- src/Native/CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Native/CMakeLists.txt b/src/Native/CMakeLists.txt index a814277e91..0c8e184517 100644 --- a/src/Native/CMakeLists.txt +++ b/src/Native/CMakeLists.txt @@ -8,6 +8,14 @@ set(RESOURCES) include_directories("${CMAKE_BINARY_DIR}/../../") if(WIN32) + # Clobber and reset the default C and CXX flags because + # CMake uses /ZI (Edit and Continue) for generating pdbs + # which is incompatible with the /guard:cf flag we set below + # for security. So we use the default flags set by CMake + # and reset /ZI with /Zi + set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1 /JMC") + set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1 /JMC") + add_definitions(-DWIN32) add_definitions(-D_WIN32=1) add_definitions(-DUNICODE -D_UNICODE) @@ -22,7 +30,7 @@ if(WIN32) add_compile_options($<$:/MT>) add_compile_options($<$:/MT>) add_compile_options(/guard:cf) - add_compile_options(/d2Zi+) # make optimized builds debugging easier + add_compile_options(/Zo) # make optimized builds debugging easier. /Zo is the newer documented flag. add_compile_options(/nologo) # Suppress Startup Banner add_compile_options(/W3) # set warning level to 3 add_compile_options(/WX) # treat warnings as errors From cb446be57b5debba61f711ed55159e16ce575a1f Mon Sep 17 00:00:00 2001 From: "Harish S. 
Kulkarni" Date: Fri, 21 Jun 2019 09:44:22 -0700 Subject: [PATCH 2/6] Added additional message describing the previous fix --- src/Native/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Native/CMakeLists.txt b/src/Native/CMakeLists.txt index 0c8e184517..96345141b7 100644 --- a/src/Native/CMakeLists.txt +++ b/src/Native/CMakeLists.txt @@ -13,6 +13,9 @@ if(WIN32) # which is incompatible with the /guard:cf flag we set below # for security. So we use the default flags set by CMake # and reset /ZI with /Zi + message("CMAKE_C_FLAGS_DEBUG is ${CMAKE_C_FLAGS_DEBUG}") + message("CMAKE_CXX_FLAGS_DEBUG is ${CMAKE_CXX_FLAGS_DEBUG}") + message("In a future version, If the default compiler flags no longer contain the /ZI flag, delete this message block and the two lines below.") set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1 /JMC") set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1 /JMC") From 0b55903da603fa2a595e62c0f382c19327c5b014 Mon Sep 17 00:00:00 2001 From: Harish Kulkarni Date: Thu, 29 Aug 2019 21:25:45 -0700 Subject: [PATCH 3/6] Syncing upstream fork (#10) * Throw error on incorrect Label name in InferColumns API (#47) * Added sequential grouping of columns * reverted the file * addded infer columns label name checking * added column detection error * removed unsed usings * added quotes * replace Where with Any clause * replace Where with Any clause * Set Nullable Auto params to null values (#50) * Added sequential grouping of columns * reverted the file * added auto params as null * change to the update fields method * First public api propsal (#52) * Includes following 1) Final proposal for 0.1 public API surface 2) Prefeaturization 3) Splitting train data into train and validate when validation data is null 4) Providing end to end samples one each for regression, binaryclassification and multiclass classification * Incorporating code review feedbacks * Revert "Set Nullable Auto params to null values" (#53) * Revert "First public api propsal 
(#52)" This reverts commit e4a64cf4aeab13ee9e5bf0efe242da3270241bd7. * Revert "Set Nullable Auto params to null values (#50)" This reverts commit 41c663cd14247d44022f40cf2dce5977dbab282d. * AutoFit return type is now an IEnumerable (#55) AutoFit returns is now an IEnumerable - this enables many good things Implementing variety of early stopping criteria (See sample) Early discard of models that are no good. This improves memory usage efficiency. (See sample) No need to implement a callback to get results back Getting best score is now outside of API implementation. It is a simple math function to compare scores (See sample). Also templatized the return type for better type safety through out the code. * misc fixes & test additions, towards 0.1 release (#56) * Enable UnitTests on build server (#57) * 1) Making trainer name public (#62) 2) Fixing up samples to reflect it * Initial version of CLI tool for mlnet (#61) * added global tool initial project * removed unneccesary files, renamed files * refactoring and added base abstract classes for trainer generator * removed unused class * Added classes for transforms * added transform generate dummy classes * more refactoring, added first transform * more refactoring and added classes * changed the project structure * restructing added options class * sln changes * refactored options to different class: * added more logic for code generation of class * misc changes * reverted file * added commandline api package * reverted sample * added new command line api parser * added normalization of column names * Added command defaults and error message * implementation of all trainers * changed auto to null * added all transform generators * added error handling when args is empty and minor changes due to change in AutoML api names * changed the name of param * added new command line options and restructuring code * renamed proj file and added solution * Added code to generate usings, Fixed few bugs in the code * added 
validation to the command line options * changed project name * Bug fixes due to API change in AutoML * changed directory structure * added test framework and basic tests * added more tests * added improvements to template and error handling * renamed the estimator name * fixed test case * added comments * added headers * changed namespace and removed unneccesary properties from project * Revert "changed namespace and removed unneccesary properties from project" This reverts commit 9edae033e9845e910f663f296e168f1182b84f5f. * fixed test cases and renamed namespaces * cleaned up proj file * added folder structure * added symbols/tokens for strings * added more tests * review comments * modified test cases * review comments * change in the exception message * normalized line endings * made method private static * simplified range building /optimization * minor fix * added header * added static methods in command where necessary * nit picks * made few methods static * review comments * nitpick * remove line pragmas * fix test case * Use better AutiFit overload and ignore Multiclass (#64) * Upgrading CLI to produce ML.NET V.10 APIs and bunch of Refactoring tasks (#65) * Added sequential grouping of columns * reverted the file * upgrade to v .10 and refactoring * added null check * fixed unit tests * review comments * removed the settings change * added regions * fixed unit tests * Upgrade ML.NET package to 0.10.0 (#70) * Change in template to accomodate new API of TextLoader (#72) * Added sequential grouping of columns * reverted the file * changed to new API of Text Loader * changed signature * added params for taking additional settings * changes to codegen params * refactoring of templates and fixing errors * Enable gated check for mlnet.tests (#79) * Added sequential grouping of columns * reverted the file * changed to new API of Text Loader * changed signature * added params for taking additional settings * changes to codegen params * refactoring of templates and 
fixing errors * added run-tests.proj and referred it in build.proj * CLI tool - make validation dataset optional and support for crossvalidation in generated code (#83) * Added sequential grouping of columns * reverted the file * bug fixes, more logic to templates to support cross-validate * formatting and fix type in consolehelper * Added logic in templates * revert settings * benchmarking related changes (#63) * Create test.txt * Create test.txt * changes needed for benchmarking * forgot one file * merge conflict fix * fix build break * back out my version of the fix for Label column issue and fix the original fix * bogus file removal * undo SuggestedPipeline change * remove labelCol from pipeline suggester * fix build break * fix fast forest learner (don't sweep over learning rate) (#88) * Made changes to Have non-calibrated scoring for binary classifiers (#86) * Added sequential grouping of columns * reverted the file * added calibration workaround * removed print probability * reverted settings * rev ColumnInference API: can take label index; rev output object types; add tests (#89) * rename AutoML to Microsoft.ML.Auto everywhere and a shot at publishing nuget package (#99) * Create test.txt * Create test.txt * changes needed for benchmarking * forgot one file * merge conflict fix * fix build break * back out my version of the fix for Label column issue and fix the original fix * bogus file removal * undo SuggestedPipeline change * remove labelCol from pipeline suggester * fix build break * rename AutoML to Microsoft.ML.Auto everywhere and a shot at publishing nuget package (will probably need tweaks once I try to use the pipleline) * publish nuget (#101) * use dotnet-internal-temp agent for internal build * use dotnet-internal feed * Fix Codegen for columnConvert and ValueToKeyMapping transform and add individual transform tests (#95) * Added sequential grouping of columns * reverted the file * fix usings for type convert * added transforms tests * review 
comments * When generating usings choose only distinct usings directives (#94) * Added sequential grouping of columns * reverted the file * Added code to have unique strings * refactoring * minor fix * minor fix * Autofit overloads + cancellation + progress callbacks 1) Introduce AutoFit overloads (basic and advanced) 2) AutoFit Cancellation 3) AutoFit progress callbacks * Default the kfolds to value 5 in CLI generated code (#115) * Added sequential grouping of columns * reverted the file * Set up CI with Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * remove file * added kfold param and defaulted to value * changed type * added for regression * Remove extra ; from generated code (#114) * Added sequential grouping of columns * reverted the file * Set up CI with Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * removed extra ; from generated code * removed file * fix unit tests * TimeoutInSeconds (#116) Specifying timeout in seconds instead of minutes * Added more command line args implementation to CLI tool and refactoring (#110) * Added sequential grouping of columns * reverted the file * Set up CI with Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * added git status * reverted change * added codegen options and refactoring * minor fixes' * renamed params, minor refactoring * added tests for commandline and refactoring * removed file * added back the test case * minor fixes * Update src/mlnet.Test/CommandLineTests.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * review comments * capitalize the first character * changed the name of test case * remove unused directives * Fail gracefully if unable to instantiate data view with swept parameters (#125) * gracefully fail if fail to parse a datai * rev * validate AutoFit 'Features' column must be 
of type R4 (#132) * Samples: exceptions / nits (#124) * Logging support in CLI + Implementation of cmd args [--name,--output,--verbosity] (#121) * addded logging and helper methods * fixing code after merge * added resx files, added logger framework, added logging messages * added new options * added spacing * minor fixes * change command description * rename option, add headers, include new param in test * formatted * build fix * changed option name * Added NlogConfig file * added back config package * fix tests * added correct validation check (#137) * Use CreateTextLoader(..) instead of CreateTextLoader(..) (#138) * added support to loaddata by class in the generated code * fix tests * changed CreateTextLoader to ReadFromTextFile method. (#140) * changed textloader to readfromtextfile method * formatting * exception fixes (#136) * infer purpose of hidden columns as 'ignore' (#142) * Added approval tests and bunch of refactoring of code and normalizing namespaces (#148) * changed textloader to readfromtextfile method * formatting * added approval tests and refactoring of code * removed few comments * API 2.0 skeleton (#149) Incorporating API review feedback * The CV code should come before the training when there is no test dataset in generated code (#151) * reorder cv code * build fix * fixed structure * Format the generated code + bunch of misc tasks (#152) * added formatting and minor changes for reordering cv * fixing the template * minor changes * formatting changes * fixed approval test * removed unused nuget * added missing value replacing * added test for new transform * fix test * Update src/mlnet/Templates/Console/MLCodeGen.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * Sanitize the column names in CLI (#162) * added sanitization layer in CLI * fix test * changed exception.StackTrace to exception.ToString() * fix package name (#168) * Rev public API (#163) * Rename TransformGeneratorBase .cs to TransformGeneratorBase.cs 
(#153) * Fix minor version for the repository + remove Nlog config package (#171) * changed the minor version * removed the nlog config package * Added new test to columninfo and fixing up API (#178) * Make optimizing metric customizable and add trainer whitelist functionality (#172) * API rev (#181) * propagate root MLContext thru AutoML (instead of creating our own) (#182) * Enabling new command line args (#183) * fix package name * initial commit * added more commandline args * fixed tests * added headers * fix tests * fix test * rename 'AutoFitter' to 'Experiment' (#169) * added tests (#187) * rev InferColumns to accept ColumnInfo input param (#186) * Implement argument --has-header and change usage of dataset (#194) * added has header and fixed dataset and train dataset * fix tests * removed dummy command (#195) * Fix bug for regression and sanitize input label from user (#198) * removed dummy command * sanitize label and fix template * fix tests * Do not generate code concatenating columns when the dataset has a single feature column (#191) * Include some missed logging in the generated code. 
(#199) * added logging messages for generated code * added log messages * deleted file * cleaning up proj files (#185) * removed platform target * removed platform target * Some spaces and extra lines + bug in output path (#204) * nit picks * nit picks * fix test * accept label from user input and provide in generated code (#205) * Rev handling of weight / label columns (#203) * migrate to private ML.NET nuget for latest bug fixes (#131) * fix multiclass with nonstandard label (#207) * Multiclass nondefault label test (#208) * printing escaped chars + bug (#212) * delete unused internal samples (#211) * fix SMAC bug that causes multiclass sample to infinite loop (#209) * Rev user input validation for new API (#210) * added console message for exit and nit picks (#215) * exit when exception encountered (#216) * Seal API classes (and make EnableCaching internal) (#217) * Suggested sample nits (feel free to ask for any of these to be reverted) (#219) * User input column type validation (#218) * upgrade commandline and renaming (#221) * upgrade commandline and renaming * renaming fields * Make build.sh, init-tools.sh, & run.sh executable on OSX/Linux (#225) * CLI argument descriptions updated (#224) * CLI argument descriptions updated * No version in .csproj * added flag to disable training code (#227) * Exit if perfect model produced (#220) * removed header (#228) * removed header * added auto generated header * removed console read key (#229) * Fix model path in generated file (#230) * removed console read key * fix model path * fix test * reorder samples (#231) * remove rule that infers column purpose as categorical if # of distinct values is < 100 (#233) * Null reference exception fix for finding best model when some runs have failed (#239) * samples fixes (#238) * fix for defaulting Averaged Perceptron # of iterations to 10 (#237) * Bug bash feedback Feb 27. API changes and sample changes (#240) * Bug bash feedback Feb 27. 
API changes Sample changes Exception fix * Samples / API rev from 2/27 bug bash feedback (#242) * changed the directory structure for generated project (#243) * changed the directory structure for generated project * changed test * upgraded commandline package * Fix test file locations on OSX (#235) * fix test file locations on OSX * changing to Path.Combine() * Additional Path.Combine() * Remove ConsoleCodeGeneratorTests.GeneratedTrainCodeTest.received.txt * Additional Path.Combine() * add back in double comparison fix * remove metrics agent NaN returns * test fix * test format fix * mock out path Thanks to @daholste for additional fixes! * upgrade to latest ML.NET public surface (#246) * Upgrade to ML.NET 0.11 (#247) * initial changes * fix lightgbm * changed normalize method * added tests * fix tests * fix test * Private preview final API changes (#250) * .NET framework design guidelines applied to public surface * WhitelistedTrainers -> Trainers * Add estimator to public API iteration result (#248) * LightGBM pipeline serialization fix (#251) * Change order that we search for TextLoader's parameters (#256) * CLI IFileInfo null exception fix (#254) * Averaged Perceptron pipeline serialization fix (#257) * Upgrade command-line-api and default folder name change (#258) * change in default folderName * upgrade command line * Update src/mlnet/Program.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * eliminate IFileInfo from CLI (#260) * Rev samples towards private preview; ignored columns fix (#259) * remove unused methods in consolehelper and nit picks in generated code (#261) * nit picks * change in console helper * fix tests * add space * fix tests * added nuget sources in generated csproj (#262) * added nuget sources in csproj * changed the structure in generated code * space * upgrade to mlnet 0.11 (#263) * Formatting CLI metrics (#264) Ensures space between printed metrics (also model counter). Right aligned metrics. 
Extended AUC to four digits. * Add implementation of non -ova multi class trainers code gen (#267) * added non ova multi class learners * added tests * test cases * Add caching (#249) * AdvancedExperimentSettings sample nits (#265) * Add sampling key column (#268) * Initial work for multi-class classification support for CLI (#226) * Initial work for multi-class classification support for CLI * String updates * more strings * Whitelist non-OVA multi-class learners * Refactor the orchestration of AutoML calls (#272) * Do not auto-group columns with suggested purpose = 'Ignore' (#273) * Fix: during type inferencing, parse whitespace strings as NaN (#271) * Printing additional metrics in CLI for binary classification (#274) * Printing additional metrics in CLI for binary classification * Update src/mlnet/Utilities/ConsolePrinter.cs * Add API option to store models on disk (instead of in memory); fix IEstimator memory leak (#269) * Print failed iterations in CLI (#275) * change the type to float from double (#277) * cache arg implementation in CLI (#280) * cache implementation * corrected the null case * added tests for all cases * Remove duplicate value-to-key mapping transform for multiclass string labels (#283) * Add post-trainer transform SDK infra; add KeyToValueMapping transform to CLI; fix: for generated multiclass models, convert predicted label from key to original label column type (#286) * Implement ignore columns command line arg (#290) * normalize line endings * added --ignore-columns * null checks * unit tests * Print winning iteration and runtime in CLI (#288) * Print best metric and runtime * Print best metric and runtime * Line endings in AutoMLEngine.cs * Rename time column to duration to match Python SDK * Revert to MicroAccuracy and MacroAccuracy spellings * Revert spelling of BinaryClassificationMetricsAgent to BinaryMetricsAgent to reduce merge conflicts * Revert spelling of MulticlassMetricsAgent to MultiMetricsAgent to reduce merge conflicts * 
missed some files * Fix merge conflict * Update AutoMLEngine.cs * Add MacOS & Linux to CI; MacOS & Linux test fixes (#293) * MicroAccuracy as default for multi-class (#295) Change default optimization metric for multi-class classification to MicroAccuracy (accuracy). Previously it was set to MacroAccuracy. * Null exception for ignorecolumns in CLI (#294) * Null exception for ignorecolumns in CLI * Check if ignore-columns array has values (as the default is now an empty array) * Emit caching flag in pipeline object model. (Includes SuggestedPipelineBuilder refactor & debug string fixes / refactor) (#296) * removed sln (#297) * Caching enabling in code gen part -2 (#298) * add * added caching codegen * support comma separated values for --ignore-columns (#300) * default initialization for ignore columns (#302) * default initialization * added null check * Codegen for multiclass non-ova (#303) * changes to template * multiclass codegen * test cases * fix test cases * Generated Project new structure. 
(#305) * added new templates * writing files to disck * change path * added new templates * misisng braces * fix bugs * format code * added util methods for solution file creation and addition of projects to it * added extra packages to project files * new tests * added correct path for sln * build fix * fix build * include using system in prediction class (#307) * added using * fix test * Random number generator is not thread safe (#310) * Random number generator is not thread safe * Another local random generator * Missed a few references * Referncing AutoMlUtils.random instead of a local RNG * More refs to mail RNG; remove Float as per https://github.com/dotnet/machinelearning/issues/1669 * Missed Random.cs * Fix multiclass code gen (#314) * compile error in codegen * removes scores printing * fix bugs * fix test * Fix compile error in codegen project (#319) * removed redundant code * fix test case * Rev OVA pipeline node SDK output: wrap binary trainers as children inside parent OVA node (#317) * Ova Multi class codegen support (#321) * dummy * multiova implementation * fix tests * remove inclusion list * fix tests and console helper * Rev run result trainer name for OVA: output different trainer name for each OVA + binary learner combination (#322) * Rev run result trainer name for Ova: output different trainer name for each Ova + binary learner combination * test fixes * Console helper bug in generated code for multiclass (#323) * fix * fix test * looping perlogclass * fix test * Initial version of Progress bar impl and CLI UI experience (#325) * progressbar * added progressbar and refactoring * reverted * revert sign assembly * added headers and removed exception rethrow * Setting model directory to temp directory (#327) * Suggested changes to progress bar (#335) * progressbar * added progressbar and refactoring * reverted * revert sign assembly * added headers and removed exception rethrow * bug fixes and updates to UI * added friendly name printing for 
metric * formatting * Rev Samples (#334) * Telemetry2 (#333) * Create test.txt * Create test.txt * changes needed for benchmarking * forgot one file * merge conflict fix * fix build break * back out my version of the fix for Label column issue and fix the original fix * bogus file removal * undo SuggestedPipeline change * remove labelCol from pipeline suggester * fix build break * rename AutoML to Microsoft.ML.Auto everywhere and a shot at publishing nuget package (will probably need tweaks once I try to use the pipleline) * tweak queue in vsts-ci.yml * CLI telemetry implementation * Telemetry implementation * delete unnecessary file and change file size bucket to actually log log2 instead of nearest ceil value * add headers, remove comments * one more header missing * Fix progress bar in linux/osx (#336) * progressbar * added progressbar and refactoring * reverted * revert sign assembly * added headers and removed exception rethrow * bug fixes and updates to UI * added friendly name printing for metric * formatting * change from task to thread * Update src/mlnet/CodeGenerator/CodeGenerationHelper.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * Mem leak fix (#328) * Create test.txt * Create test.txt * changes needed for benchmarking * forgot one file * merge conflict fix * fix build break * back out my version of the fix for Label column issue and fix the original fix * bogus file removal * undo SuggestedPipeline change * remove labelCol from pipeline suggester * fix build break * rename AutoML to Microsoft.ML.Auto everywhere and a shot at publishing nuget package (will probably need tweaks once I try to use the pipleline) * tweak queue in vsts-ci.yml * there is still investigation to be done but this fix works and solves memory leak problems * minor refactor * Upgrade ML.NET package (#343) * Add cross-validation (CV), and auto-CV for small datasets; push common API experiment methods into base class (#287) * restore old yml for internal 
pipeline so we can publish nuget again to devdiv stream (#344) * Polishing the CLI UI part-1 (#338) * formatting of pbar message * Polishing the UI * optimization * rename variable * Update src/mlnet/AutoML/AutoMLEngine.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * Update src/mlnet/CodeGenerator/CodeGenerationHelper.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * new message * changed hhtp to https * added iteration num + 1 * change string name and add color to artifacts * change the message * build errors * added null checks * added exception messsages to log file * added exception messsages to log file * CLI ML.NET version upgrade (#345) * Sample revs; ColumnInformation property name revs; pre-featurizer fixes (#346) * CLI -- consume logs from AutoML SDK (#349) * Rename RunDetails --> RunDetail (#350) * command line api upgrade and progress bar rendering bug (#366) * added fix for all platforms progress bar * upgrade nuget * removed args from writeline * change in the version (#368) * fix few bugs in progressbar and verbosity (#374) * fix few bugs in progressbar and verbosity * removed unused name space * Fix for folders with space in it while generating project (#376) * support for folders with spaces * added support for paths with space * revert file * change name of var * remove spaces * SMAC fix for minimizing metrics (#363) * Formatting Regression metrics and progress bar display days. 
(#379) * added progress bar day display and fix regression metrics * fix formatting * added total time * formatted total time * change command name and add pbar message (#380) * change command name and add pbar message * fix tests * added aliases * duplicate alias * added another alias for task * UI missing features (#382) * added formatting changes * added accuracy specifically * downgrade the codepages (#384) * Change in project structure (#385) * initial changes * Change in project structure * correcting test * change variable name * fix tests * fix tests * fix more tests * fix codegen errors * adde log file message * changed name of args * change variable names * fix test * FileSizeBuckets in correct units (#387) * Minor telemetry change to log in correct units and make our life easier in the future * Use Ceiling instead of Round * changed order (#388) * prep work to transfer to ml.net (#389) * move test projects to top level test subdir * rename some projects to make naming consistent and make it build again * fix test project refs * Add AutoML components to build, fix issues related to that so it builds * fix test cases, remove AppInsights ref from AutoML (#3329) * [AutoML] disable netfx build leg for now (#3331) * disable netfx build leg for now * disable netfx build leg for now. * [AutoML] Add AutoML XML documentation to all public members; migrate AutoML projects & tests into ML.NET solution; AutoML test fixes (#3351) * [AutoML] Rev AutoML public API; add required native references to AutoML projects (#3364) * [AutoML] Minor changes to generated project in CLI based on feedback (#3371) * nitpicks for generated project * revert back the target framework * [AutoML] Migrate AutoML back to its own solution, w/ NuGet dependencies (#3373) * Migrate AutoML back to its own solution, w/ NuGet dependencies * build project updates; parameter name revert * dummy change * Revert "dummy change" This reverts commit 3e8574266f556a4d5b6805eb55b4d8b8b84cf355. 
* [AutoML] publish AutoML package (#3383) * publish AutoML package * Only leave automl and mlnet tests to run * publish AutoML package * Only leave automl and mlnet tests to run * fix build issues when ml.net is not building * bump version to 0.3 since that's the one we're going to ship for build (#3416) * [AutoML] temporarily disable all but x64 platforms -- don't want to do native builds and can't find a way around that with the current VSTS pipeline (#3420) * disable steps but keep phases to keep vsts build pipeline happy (#3423) * API docs for experimentation (#3484) * fixed path bug and regression metrics correction (#3504) * changed the casing of option alias as it conflicts with --help (#3554) * [AutoML] Generated project - FastTree nuget package inclusion dynamically (#3567) * added support for fast tree nuget pack inclusion in generated project * fix testcase * changed the tool name in telemetry message * dummy commit * remove space * dummy commit to trigger build * [AutoML] Add AutoML example code (#3458) * AutoML PipelineSuggester: don't recommend pipelines from first-stage trainers that failed (#3593) * InferColumns API: Validate all columns specified in column info exist in inferred data view (#3599) * [AutoML] AutoML SDK API: validate schema types of input IDataView (#3597) * [AutoML] If first three iterations all fail, short-circuit AutoML experiment (#3591) * mlnet CLI nupkg creation/signing (#3606) * mlnet CLI nupkg creation/signing * remove includeinpackage from mlnet csproj * address PR comments -- some minor reshuffling of stuff * publish symbols for mlnet CLI * fix case in NLog.config * [AutoML] rename Auto to AutoML in namespace and nuget (#3609) * mlnet CLI nupkg creation/signing * [AutoML] take dependency on a specific ml.net version (#3610) * take dependency on a specific ml.net version * catch up to spelling fix for OptimizationTolerance * force a specific ml.net nuget version, fix typo (#3616) * [AutoML] Fix error handling in CLI. 
(#3618) * fix error handling * renaming variables * [AutoML] turn off line pragmas in .tt files to play nice with signing (#3617) * turn off line pragmas in .tt files to play nice with signing * dedupe tags * change the param name (#3619) * [AutoML] return null instead of null ref crash on Model property accessor (#3620) * return null instead of null ref crash on Model property accessor * [AutoML] Handling label column names which have space and exception logging (#3624) * fix case of label with space and exception logging * final handler * revert file * use Name instead of FullName for telemetry filename hash (#3633) * renamed classes (#3634) * change ML.NET dependency to 1.0 (#3639) [AutoML] undo pinning ML.NET dependency * set exploration time default in CLI to half hour (#3640) * [AutoML] step 2 of removing pinned nupkg versions (#3642) * InferColumns API that consumes label column index -- Only rename label column to 'Label' for headerless files (#3643) * [AutoML] Upgrade ml.net package in generated code (#3644) * upgrade the mlnet package in gen code * Update src/mlnet/Templates/Console/ModelProject.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * Update src/mlnet/Templates/Console/ModelProject.tt Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * added spaces * [AutoML] Early stopping in CLI based on the exploration time (#3641) * early stopping in CLI * remove unused variables * change back to thread * remove sleep * fix review comments * remove ununsed usings * format message * collapse declaration * remove unused param * added environment.exit and removal of error message * correction in message * secs-> seconds * exit code * change value to 1 * reverse the declaration * [AutoML] Change wording for CouldNotFinshOnTime message (#3655) * set exploration time default in CLI to half hour * [AutoML] Change wording for CouldNotFinshOnTime message * [AutoML] Change wording for CouldNotFinshOnTime message * even 
better wording for CouldNotFinshOnTime * temp change to get around vsts publish failure (#3656) * [AutoML] bump version to 0.4.0 (#3658) * implement culture invariant strings (#3725) * reset culture (#3730) * [AutoML] Cross validation fixes; validate empty training / validation input data (#3794) * [AutoML] Enable style cop rules & resolve errors (#3823) * add task agnostic wrappers for autofit calls (#3860) * [AutoML] CLI telemetry rev (#3789) * delete automl .sln * CLI -- regenerate templated CS files (#3954) * [AutoML] Bump ML.NET package version to 1.2.0 in AutoML API and CLI; and AutoML package versions to 0.14.0 (#3958) * Build AutoML NuGet package (#3961) * Increment AutoML build version to 0.15.0 for preview. (#3968) * added culture independent parsing (#3731) * - convert tests to xunit - take project level dependency on ML.NET components instead of nuget - set up bestfriends relationship to ML.Core and remove some of the copies of util classes from AutoML.NET (more work needed to fully remove them, work item 4064) - misc build script changes to address PR comments * address issues only showing up in a couple configurations during CI build * fix cut&paste error * [AutoML] Bump version to ML.NET 1.3.1 in AutoML API and CLI and AutoML package version to 0.15.1 (#4071) * bumped version * change versions in nupkg * revert version bump in branch props * [AutoML] Fix for Exception thrown in cross val when one of the score equals infinity. 
(#4073) * bumped version * change versions in nupkg * revert version bump in branch props * added infinity fix * changes signing (#4079) * Addressed PR comments and build issues - sync block on creating test data file (failed intermittently) - removed classes we copied over from ML.Core and fixed their uses to de-dupe and use original ML.Core versions since we now have InternalsVisible and BestFriends - Fixed nupkg creation to use projects insted of public nuget version for AutoML - Fixed a bunch of unit tests that didn't actually test what they were supposed to test, while removing cut&past code and dependencies. - Few more misc small changes * minor nit - removed unused folder ref * Fix the .sln file for the right configurations. * Fix mistake in .sln file * test fixes and disable one test * fix tests, re-add AutoML samples csproj * bumped VS version to 16 in .sln, removed InternalsVisible for a dead assembly, removed unused references from AutoML test project * Updated docs to include PredictedLabel member (#4107) * Fixed build errors resulting from upgrade to VS2019 compilers * Added additional message describing the previous fix * Updated docs to include PredictedLabel member * Added CODEOWNERS file in the .github/ folder. (#4140) * Added CODEOWNERS file in the .github/ folder. This allows reviewers to review any changes in the machine learning repository * Updated .github/CODEOWNERS with the team instead of individual reviewers * Added AutoML team reviewers (#4144) * Added CODEOWNERS file in the .github/ folder. This allows reviewers to review any changes in the machine learning repository * Updated .github/CODEOWNERS with the team instead of individual reviewers * Added AutoML team reviwers to files owned by AutoML team * Added AutoML team reviwers to files owned by AutoML team * Removed two files that don't exist for AutoML team in CODEOWNERS * Build extension method to reload changes without specifying model name (#4146) * Image classification preview 2. 
(#4151) * Image classification preview 2. * PR feedback. * Add unit-test. * Add unit-test. * Add unit-test. * Add unit-test. * Use Path.Combine instead of Join. * fix test dataset path. * fix test dataset path. * Improve test. * Improve test. * Increase epochs in tests. * Disable test on Ubuntu. * Move test to its own project. * Move test to its own project. * Move test to its own project. * Move test to its own file. * cleanup. * Disable parallel execution of tensorflow tests. * PR feedback. * PR feedback. * PR feedback. * PR feedback. * Prevent TF test to execute in parallel. * PR feedback. * Build error. * clean up. --- .github/CODEOWNERS | 11 + Microsoft.ML.sln | 91 +- .../io-columns-anomaly-detection.md | 3 +- .../BinaryClassificationExperiment.cs | 74 + .../DataStructures/PixelData.cs | 14 + .../DataStructures/PixelPrediction.cs | 10 + .../DataStructures/SentimentIssue.cs | 13 + .../DataStructures/SentimentPrediction.cs | 14 + .../DataStructures/TaxiTrip.cs | 28 + .../DataStructures/TaxiTripFarePrediction.cs | 10 + .../Microsoft.ML.AutoML.Samples.csproj | 12 + .../MulticlassClassificationExperiment.cs | 71 + .../Microsoft.ML.AutoML.Samples/Program.cs | 30 + .../RegressionExperiment.cs | 76 + .../InceptionV3TransferLearning.cs | 109 -- .../ResnetV2101TransferLearning.cs | 123 -- ...snetV2101TransferLearningTrainTestSplit.cs | 307 +++++ .../Microsoft.ML.AutoML.nupkgproj | 14 + .../Microsoft.ML.AutoML.symbols.nupkgproj | 3 + .../Builder/BuilderExtensions.cs | 19 + src/Microsoft.ML.AutoML/API/AutoCatalog.cs | 212 +++ .../API/BinaryClassificationExperiment.cs | 170 +++ .../API/ColumnInference.cs | 106 ++ src/Microsoft.ML.AutoML/API/ExperimentBase.cs | 341 +++++ .../CrossValidationExperimentResult.cs | 40 + .../API/ExperimentResults/ExperimentResult.cs | 40 + .../API/ExperimentSettings.cs | 96 ++ .../API/InferenceException.cs | 46 + .../API/MLContextExtension.cs | 22 + .../API/MulticlassClassificationExperiment.cs | 162 +++ src/Microsoft.ML.AutoML/API/Pipeline.cs | 
110 ++ .../API/RegressionExperiment.cs | 176 +++ .../RunDetails/CrossValidationRunDetail.cs | 78 ++ .../API/RunDetails/RunDetail.cs | 109 ++ src/Microsoft.ML.AutoML/Assembly.cs | 13 + src/Microsoft.ML.AutoML/AutoMlUtils.cs | 14 + .../ColumnGroupingInference.cs | 151 ++ .../ColumnInference/ColumnInferenceApi.cs | 152 ++ .../ColumnInferenceValidationUtil.cs | 28 + .../ColumnInference/ColumnInformationUtil.cs | 144 ++ .../ColumnInference/ColumnPurpose.cs | 18 + .../ColumnInference/ColumnTypeInference.cs | 414 ++++++ .../ColumnInference/PurposeInference.cs | 283 ++++ .../ColumnInference/TextFileContents.cs | 124 ++ .../ColumnInference/TextFileSample.cs | 304 ++++ .../DatasetDimensions/ColumnDimensions.cs | 18 + .../DatasetDimensions/DatasetDimensionsApi.cs | 50 + .../DatasetDimensionsUtil.cs | 86 ++ .../EstimatorExtensionCatalog.cs | 49 + .../EstimatorExtensions.cs | 272 ++++ .../IEstimatorExtension.cs | 11 + .../Experiment/Experiment.cs | 150 ++ .../MetricsAgents/BinaryMetricsAgent.cs | 86 ++ .../Experiment/MetricsAgents/IMetricsAgent.cs | 15 + .../MetricsAgents/MetricsAgentUtil.cs | 16 + .../MetricsAgents/MultiMetricsAgent.cs | 74 + .../MetricsAgents/RegressionMetricsAgent.cs | 69 + .../Experiment/ModelContainer.cs | 50 + .../Experiment/OptimizingMetricInfo.cs | 44 + .../Experiment/RecipeInference.cs | 29 + .../Experiment/Runners/CrossValRunner.cs | 75 + .../Runners/CrossValSummaryRunner.cs | 102 ++ .../Experiment/Runners/IRunner.cs | 14 + .../Experiment/Runners/RunnerUtil.cs | 60 + .../Experiment/Runners/TrainValidateRunner.cs | 66 + .../Experiment/SuggestedPipeline.cs | 144 ++ .../Experiment/SuggestedPipelineBuilder.cs | 43 + .../SuggestedPipelineCrossValRunDetail.cs | 55 + .../SuggestedPipelineRunDetail.cs | 59 + .../SuggestedPipelineRunDetailUtil.cs | 18 + .../Experiment/SuggestedTrainer.cs | 92 ++ .../Microsoft.ML.AutoML.csproj | 16 + .../PipelineSuggesters/PipelineSuggester.cs | 217 +++ src/Microsoft.ML.AutoML/Sweepers/ISweeper.cs | 273 ++++ 
.../Sweepers/Parameters.cs | 481 +++++++ src/Microsoft.ML.AutoML/Sweepers/Random.cs | 29 + .../Sweepers/SmacSweeper.cs | 436 ++++++ .../Sweepers/SweeperBase.cs | 78 ++ .../Sweepers/SweeperProbabilityUtils.cs | 160 +++ src/Microsoft.ML.AutoML/TaskKind.cs | 13 + .../Terminators/IterationBasedTerminator.cs | 26 + .../BinaryTrainerExtensions.cs | 235 ++++ .../TrainerExtensions/ITrainerExtension.cs | 20 + .../MultiTrainerExtensions.cs | 232 ++++ .../RegressionTrainerExtensions.cs | 187 +++ .../TrainerExtensions/SweepableParams.cs | 205 +++ .../TrainerExtensionCatalog.cs | 138 ++ .../TrainerExtensions/TrainerExtensionUtil.cs | 382 ++++++ .../TransformInference/TransformInference.cs | 419 ++++++ .../TransformInferenceApi.cs | 22 + .../TransformPostTrainerInference.cs | 43 + .../Utils/BestResultUtil.cs | 114 ++ .../Utils/ColumnTypeExtensions.cs | 36 + .../Utils/DatasetColumnInfo.cs | 41 + src/Microsoft.ML.AutoML/Utils/Logger.cs | 30 + src/Microsoft.ML.AutoML/Utils/SplitUtil.cs | 70 + .../Utils/SweepableParamAttributes.cs | 214 +++ .../Utils/UserInputValidationUtil.cs | 272 ++++ .../Properties/AssemblyInfo.cs | 1 + .../Properties/AssemblyInfo.cs | 1 + .../Properties/AssemblyInfo.cs | 2 + src/Microsoft.ML.Dnn/DnnCatalog.cs | 76 +- src/Microsoft.ML.Dnn/DnnModel.cs | 4 +- ...DnnTransform.cs => DnnRetrainTransform.cs} | 733 ++-------- src/Microsoft.ML.Dnn/DnnUtils.cs | 23 +- .../ImageClassificationTransform.cs | 1222 +++++++++++++++++ .../Properties/AssemblyInfo.cs | 1 + .../TensorflowTransform.cs | 19 +- .../Microsoft.ML.AutoML.Tests/AutoFitTests.cs | 64 + .../BestResultUtilTests.cs | 63 + .../ColumnInferenceTests.cs | 150 ++ .../ColumnInferenceValidationUtilTests.cs | 28 + .../ColumnInformationUtilTests.cs | 57 + .../ConversionTests.cs | 95 ++ .../DatasetDimensionsTests.cs | 86 ++ test/Microsoft.ML.AutoML.Tests/DatasetUtil.cs | 86 ++ .../EstimatorExtensionTests.cs | 53 + .../GetNextPipelineTests.cs | 84 ++ .../InferredPipelineTests.cs | 65 + .../MetricsAgentsTests.cs | 165 
+++ test/Microsoft.ML.AutoML.Tests/MetricsUtil.cs | 49 + .../Microsoft.ML.AutoML.Tests.csproj | 35 + .../PurposeInferenceTests.cs | 38 + .../SplitUtilTests.cs | 70 + .../SuggestedPipelineBuilderTests.cs | 83 ++ .../Microsoft.ML.AutoML.Tests/SweeperTests.cs | 170 +++ .../TestData/BinaryDatasetWithBoolColumn.txt | 5 + .../DatasetWithDefaultColumnNames.txt | 4 + .../TestData/DatasetWithEmptyColumn.txt | 4 + .../NameColumnIsOnlyFeatureDataset.txt | 103 ++ .../TestData/TrivialMulticlassDataset.txt | 181 +++ .../TextFileSampleTests.cs | 50 + .../TrainerExtensionsTests.cs | 311 +++++ .../TransformInferenceTests.cs | 757 ++++++++++ .../TransformPostTrainerInferenceTests.cs | 70 + .../UserInputValidationTests.cs | 301 ++++ test/Microsoft.ML.AutoML.Tests/Util.cs | 37 + .../Utils/MLNetUtils/DataViewTestFixture.cs | 62 + .../Utils/MLNetUtils/MLNetUtils.cs | 23 + .../Utils/TaskAgnosticAutoFit.cs | 144 ++ .../Utils/TaskAgnosticIterationResult.cs | 87 ++ .../UnitTests/TestEntryPoints.cs | 30 +- test/Microsoft.ML.Predictor.Tests/Test-API.cs | 42 +- .../TestCreateInstances.cs | 36 +- .../TestCrossValidation.cs | 6 +- .../TestIniModels.cs | 4 +- .../TestPredictors.cs | 76 +- .../TestTrivialPredictors.cs | 8 +- .../Scenarios/TensorflowTests.cs | 99 -- .../TensorflowTests.cs | 274 +++- .../TensorFlowEstimatorTests.cs | 4 + 151 files changed, 15662 insertions(+), 1100 deletions(-) create mode 100644 .github/CODEOWNERS create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/BinaryClassificationExperiment.cs create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/PixelData.cs create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/PixelPrediction.cs create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/SentimentIssue.cs create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/SentimentPrediction.cs create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/TaxiTrip.cs create mode 100644 
docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/TaxiTripFarePrediction.cs create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/Microsoft.ML.AutoML.Samples.csproj create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/MulticlassClassificationExperiment.cs create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/Program.cs create mode 100644 docs/samples/Microsoft.ML.AutoML.Samples/RegressionExperiment.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/InceptionV3TransferLearning.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/ResnetV2101TransferLearning.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/ResnetV2101TransferLearningTrainTestSplit.cs create mode 100644 pkg/Microsoft.ML.AutoML/Microsoft.ML.AutoML.nupkgproj create mode 100644 pkg/Microsoft.ML.AutoML/Microsoft.ML.AutoML.symbols.nupkgproj create mode 100644 src/Microsoft.ML.AutoML/API/AutoCatalog.cs create mode 100644 src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs create mode 100644 src/Microsoft.ML.AutoML/API/ColumnInference.cs create mode 100644 src/Microsoft.ML.AutoML/API/ExperimentBase.cs create mode 100644 src/Microsoft.ML.AutoML/API/ExperimentResults/CrossValidationExperimentResult.cs create mode 100644 src/Microsoft.ML.AutoML/API/ExperimentResults/ExperimentResult.cs create mode 100644 src/Microsoft.ML.AutoML/API/ExperimentSettings.cs create mode 100644 src/Microsoft.ML.AutoML/API/InferenceException.cs create mode 100644 src/Microsoft.ML.AutoML/API/MLContextExtension.cs create mode 100644 src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs create mode 100644 src/Microsoft.ML.AutoML/API/Pipeline.cs create mode 100644 src/Microsoft.ML.AutoML/API/RegressionExperiment.cs create mode 100644 src/Microsoft.ML.AutoML/API/RunDetails/CrossValidationRunDetail.cs create mode 100644 src/Microsoft.ML.AutoML/API/RunDetails/RunDetail.cs create mode 100644 
src/Microsoft.ML.AutoML/Assembly.cs create mode 100644 src/Microsoft.ML.AutoML/AutoMlUtils.cs create mode 100644 src/Microsoft.ML.AutoML/ColumnInference/ColumnGroupingInference.cs create mode 100644 src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs create mode 100644 src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceValidationUtil.cs create mode 100644 src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs create mode 100644 src/Microsoft.ML.AutoML/ColumnInference/ColumnPurpose.cs create mode 100644 src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs create mode 100644 src/Microsoft.ML.AutoML/ColumnInference/PurposeInference.cs create mode 100644 src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs create mode 100644 src/Microsoft.ML.AutoML/ColumnInference/TextFileSample.cs create mode 100644 src/Microsoft.ML.AutoML/DatasetDimensions/ColumnDimensions.cs create mode 100644 src/Microsoft.ML.AutoML/DatasetDimensions/DatasetDimensionsApi.cs create mode 100644 src/Microsoft.ML.AutoML/DatasetDimensions/DatasetDimensionsUtil.cs create mode 100644 src/Microsoft.ML.AutoML/EstimatorExtensions/EstimatorExtensionCatalog.cs create mode 100644 src/Microsoft.ML.AutoML/EstimatorExtensions/EstimatorExtensions.cs create mode 100644 src/Microsoft.ML.AutoML/EstimatorExtensions/IEstimatorExtension.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/Experiment.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/MetricsAgents/BinaryMetricsAgent.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/MetricsAgents/IMetricsAgent.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/MetricsAgents/MetricsAgentUtil.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/MetricsAgents/MultiMetricsAgent.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/MetricsAgents/RegressionMetricsAgent.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/ModelContainer.cs create mode 100644 
src/Microsoft.ML.AutoML/Experiment/OptimizingMetricInfo.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/RecipeInference.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/Runners/CrossValRunner.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/Runners/IRunner.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/Runners/RunnerUtil.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/Runners/TrainValidateRunner.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/SuggestedPipeline.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineBuilder.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineCrossValRunDetail.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineRunDetail.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineRunDetailUtil.cs create mode 100644 src/Microsoft.ML.AutoML/Experiment/SuggestedTrainer.cs create mode 100644 src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj create mode 100644 src/Microsoft.ML.AutoML/PipelineSuggesters/PipelineSuggester.cs create mode 100644 src/Microsoft.ML.AutoML/Sweepers/ISweeper.cs create mode 100644 src/Microsoft.ML.AutoML/Sweepers/Parameters.cs create mode 100644 src/Microsoft.ML.AutoML/Sweepers/Random.cs create mode 100644 src/Microsoft.ML.AutoML/Sweepers/SmacSweeper.cs create mode 100644 src/Microsoft.ML.AutoML/Sweepers/SweeperBase.cs create mode 100644 src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs create mode 100644 src/Microsoft.ML.AutoML/TaskKind.cs create mode 100644 src/Microsoft.ML.AutoML/Terminators/IterationBasedTerminator.cs create mode 100644 src/Microsoft.ML.AutoML/TrainerExtensions/BinaryTrainerExtensions.cs create mode 100644 src/Microsoft.ML.AutoML/TrainerExtensions/ITrainerExtension.cs create mode 100644 
src/Microsoft.ML.AutoML/TrainerExtensions/MultiTrainerExtensions.cs create mode 100644 src/Microsoft.ML.AutoML/TrainerExtensions/RegressionTrainerExtensions.cs create mode 100644 src/Microsoft.ML.AutoML/TrainerExtensions/SweepableParams.cs create mode 100644 src/Microsoft.ML.AutoML/TrainerExtensions/TrainerExtensionCatalog.cs create mode 100644 src/Microsoft.ML.AutoML/TrainerExtensions/TrainerExtensionUtil.cs create mode 100644 src/Microsoft.ML.AutoML/TransformInference/TransformInference.cs create mode 100644 src/Microsoft.ML.AutoML/TransformInference/TransformInferenceApi.cs create mode 100644 src/Microsoft.ML.AutoML/TransformInference/TransformPostTrainerInference.cs create mode 100644 src/Microsoft.ML.AutoML/Utils/BestResultUtil.cs create mode 100644 src/Microsoft.ML.AutoML/Utils/ColumnTypeExtensions.cs create mode 100644 src/Microsoft.ML.AutoML/Utils/DatasetColumnInfo.cs create mode 100644 src/Microsoft.ML.AutoML/Utils/Logger.cs create mode 100644 src/Microsoft.ML.AutoML/Utils/SplitUtil.cs create mode 100644 src/Microsoft.ML.AutoML/Utils/SweepableParamAttributes.cs create mode 100644 src/Microsoft.ML.AutoML/Utils/UserInputValidationUtil.cs rename src/Microsoft.ML.Dnn/{DnnTransform.cs => DnnRetrainTransform.cs} (64%) create mode 100644 src/Microsoft.ML.Dnn/ImageClassificationTransform.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/BestResultUtilTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/ColumnInferenceValidationUtilTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/ColumnInformationUtilTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/ConversionTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/DatasetDimensionsTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/DatasetUtil.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/EstimatorExtensionTests.cs create mode 100644 
test/Microsoft.ML.AutoML.Tests/GetNextPipelineTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/InferredPipelineTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/MetricsAgentsTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/MetricsUtil.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj create mode 100644 test/Microsoft.ML.AutoML.Tests/PurposeInferenceTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/SplitUtilTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/SuggestedPipelineBuilderTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/SweeperTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/TestData/BinaryDatasetWithBoolColumn.txt create mode 100644 test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithDefaultColumnNames.txt create mode 100644 test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithEmptyColumn.txt create mode 100644 test/Microsoft.ML.AutoML.Tests/TestData/NameColumnIsOnlyFeatureDataset.txt create mode 100644 test/Microsoft.ML.AutoML.Tests/TestData/TrivialMulticlassDataset.txt create mode 100644 test/Microsoft.ML.AutoML.Tests/TextFileSampleTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/TrainerExtensionsTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/TransformInferenceTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/TransformPostTrainerInferenceTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/UserInputValidationTests.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/Util.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/Utils/MLNetUtils/DataViewTestFixture.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/Utils/MLNetUtils/MLNetUtils.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/Utils/TaskAgnosticAutoFit.cs create mode 100644 test/Microsoft.ML.AutoML.Tests/Utils/TaskAgnosticIterationResult.cs delete mode 100644 test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs diff --git a/.github/CODEOWNERS 
b/.github/CODEOWNERS new file mode 100644 index 0000000000..4e01115ac0 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,11 @@ +# This file contains the default reviewers for ML .NET code +# For more information on CODEOWNERS file see : https://help.github.com/en/articles/about-code-owners + +# For the entire repository +* @dotnet/mlnet-core + +# Reviewers for files owned by AutoML team +src/Microsoft.ML.AutoML @dotnet/mlnet-automl +test/Microsoft.ML.AutoML.Tests @dotnet/mlnet-automl +pkg/Microsoft.ML.AutoML @dotnet/mlnet-automl +docs/samples/Microsoft.ML.AutoML.Samples @dotnet/mlnet-automl diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index 60adfe9dfe..4ba2ae300b 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -1,6 +1,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 15 -VisualStudioVersion = 15.0.27130.2026 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29209.152 MinimumVisualStudioVersion = 10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Core", "src\Microsoft.ML.Core\Microsoft.ML.Core.csproj", "{A6CA6CC6-5D7C-4D7F-A0F5-35E14B383B0A}" EndProject @@ -264,6 +264,18 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.StableApi", "t EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Dnn", "src\Microsoft.ML.Dnn\Microsoft.ML.Dnn.csproj", "{4C2D1A8F-7AC1-4036-B5E3-4B31769D73B8}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.AutoML.Tests", "test\Microsoft.ML.AutoML.Tests\Microsoft.ML.AutoML.Tests.csproj", "{C2652287-CD6D-40FB-B042-95FB56D09DB8}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.AutoML", "src\Microsoft.ML.AutoML\Microsoft.ML.AutoML.csproj", "{E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Microsoft.ML.AutoML", "Microsoft.ML.AutoML", "{F5D11F71-2D61-4AE9-99D7-0F0B54649B15}" + ProjectSection(SolutionItems) = 
preProject + pkg\Microsoft.ML.AutoML\Microsoft.ML.AutoML.nupkgproj = pkg\Microsoft.ML.AutoML\Microsoft.ML.AutoML.nupkgproj + pkg\Microsoft.ML.AutoML\Microsoft.ML.AutoML.symbols.nupkgproj = pkg\Microsoft.ML.AutoML\Microsoft.ML.AutoML.symbols.nupkgproj + EndProjectSection +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.AutoML.Samples", "docs\samples\Microsoft.ML.AutoML.Samples\Microsoft.ML.AutoML.Samples.csproj", "{A6924919-9E37-4023-8B7F-E85C8E3CC9B3}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -1528,6 +1540,77 @@ Global {4C2D1A8F-7AC1-4036-B5E3-4B31769D73B8}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU {4C2D1A8F-7AC1-4036-B5E3-4B31769D73B8}.Release-netfx|x64.ActiveCfg = Release-netfx|Any CPU {4C2D1A8F-7AC1-4036-B5E3-4B31769D73B8}.Release-netfx|x64.Build.0 = Release-netfx|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug|x64.ActiveCfg = Debug|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug|x64.Build.0 = Debug|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug-netcoreapp3_0|Any CPU.ActiveCfg = Debug-netcoreapp3_0|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug-netcoreapp3_0|Any CPU.Build.0 = Debug-netcoreapp3_0|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug-netcoreapp3_0|x64.ActiveCfg = Debug-netcoreapp3_0|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug-netcoreapp3_0|x64.Build.0 = Debug-netcoreapp3_0|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug-netfx|x64.ActiveCfg = Debug-netfx|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Debug-netfx|x64.Build.0 = Debug-netfx|Any CPU + 
{C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release|Any CPU.Build.0 = Release|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release|x64.ActiveCfg = Release|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release|x64.Build.0 = Release|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release-netcoreapp3_0|Any CPU.ActiveCfg = Release-netcoreapp3_0|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release-netcoreapp3_0|Any CPU.Build.0 = Release-netcoreapp3_0|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release-netcoreapp3_0|x64.ActiveCfg = Release-netcoreapp3_0|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release-netcoreapp3_0|x64.Build.0 = Release-netcoreapp3_0|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release-netfx|x64.ActiveCfg = Release-netfx|Any CPU + {C2652287-CD6D-40FB-B042-95FB56D09DB8}.Release-netfx|x64.Build.0 = Release-netfx|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug|Any CPU.Build.0 = Debug|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug|x64.ActiveCfg = Debug|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug|x64.Build.0 = Debug|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug-netcoreapp3_0|Any CPU.ActiveCfg = Debug-netcoreapp3_0|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug-netcoreapp3_0|Any CPU.Build.0 = Debug-netcoreapp3_0|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug-netcoreapp3_0|x64.ActiveCfg = Debug-netcoreapp3_0|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug-netcoreapp3_0|x64.Build.0 = Debug-netcoreapp3_0|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU + 
{E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug-netfx|x64.ActiveCfg = Debug-netfx|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Debug-netfx|x64.Build.0 = Debug-netfx|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release|Any CPU.ActiveCfg = Release|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release|Any CPU.Build.0 = Release|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release|x64.ActiveCfg = Release|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release|x64.Build.0 = Release|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release-netcoreapp3_0|Any CPU.ActiveCfg = Release-netcoreapp3_0|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release-netcoreapp3_0|Any CPU.Build.0 = Release-netcoreapp3_0|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release-netcoreapp3_0|x64.ActiveCfg = Release-netcoreapp3_0|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release-netcoreapp3_0|x64.Build.0 = Release-netcoreapp3_0|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release-netfx|x64.ActiveCfg = Release-netfx|Any CPU + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80}.Release-netfx|x64.Build.0 = Release-netfx|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug|Any CPU.Build.0 = Debug|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug|x64.ActiveCfg = Debug|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug|x64.Build.0 = Debug|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug-netcoreapp3_0|Any CPU.ActiveCfg = Debug-netcoreapp3_0|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug-netcoreapp3_0|Any CPU.Build.0 = Debug-netcoreapp3_0|Any CPU + 
{A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug-netcoreapp3_0|x64.ActiveCfg = Debug-netcoreapp3_0|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug-netcoreapp3_0|x64.Build.0 = Debug-netcoreapp3_0|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug-netfx|Any CPU.ActiveCfg = Debug-netfx|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug-netfx|Any CPU.Build.0 = Debug-netfx|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug-netfx|x64.ActiveCfg = Debug-netfx|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Debug-netfx|x64.Build.0 = Debug-netfx|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release|Any CPU.ActiveCfg = Release|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release|Any CPU.Build.0 = Release|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release|x64.ActiveCfg = Release|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release|x64.Build.0 = Release|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release-netcoreapp3_0|Any CPU.ActiveCfg = Release-netcoreapp3_0|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release-netcoreapp3_0|Any CPU.Build.0 = Release-netcoreapp3_0|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release-netcoreapp3_0|x64.ActiveCfg = Release-netcoreapp3_0|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release-netcoreapp3_0|x64.Build.0 = Release-netcoreapp3_0|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release-netfx|Any CPU.ActiveCfg = Release-netfx|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release-netfx|Any CPU.Build.0 = Release-netfx|Any CPU + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3}.Release-netfx|x64.ActiveCfg = Release-netfx|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -1610,6 +1693,10 @@ Global {AE4F7569-26F3-4160-8A8B-7A57D0DA3350} = {D3D38B03-B557-484D-8348-8BADEE4DF592} {F308DC6B-7E59-40D7-A581-834E8CD99CFE} = {7F13E156-3EBA-4021-84A5-CD56BA72F99E} {4C2D1A8F-7AC1-4036-B5E3-4B31769D73B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + 
{C2652287-CD6D-40FB-B042-95FB56D09DB8} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} + {E48285BF-F49A-4EA3-AED0-1BDDBF77EB80} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {F5D11F71-2D61-4AE9-99D7-0F0B54649B15} = {D3D38B03-B557-484D-8348-8BADEE4DF592} + {A6924919-9E37-4023-8B7F-E85C8E3CC9B3} = {DA452A53-2E94-4433-B08C-041EDEC729E6} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} diff --git a/docs/api-reference/io-columns-anomaly-detection.md b/docs/api-reference/io-columns-anomaly-detection.md index 1bcc6a2a9d..ceba1fd4f2 100644 --- a/docs/api-reference/io-columns-anomaly-detection.md +++ b/docs/api-reference/io-columns-anomaly-detection.md @@ -3,4 +3,5 @@ The input features column data must be a known-sized vector of | The non-negative, unbounded score that was calculated by the anomaly detection model.| \ No newline at end of file +| `Score` | | The non-negative, unbounded score that was calculated by the anomaly detection model.| +| `PredictedLabel` | | The predicted label, based on the threshold. A score higher than the threshold maps to `true` and a score lower than the threshold maps to `false`. 
The default threshold is `0.5`.Use to change the default value.| \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/BinaryClassificationExperiment.cs b/docs/samples/Microsoft.ML.AutoML.Samples/BinaryClassificationExperiment.cs new file mode 100644 index 0000000000..be2988e81e --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/BinaryClassificationExperiment.cs @@ -0,0 +1,74 @@ +using System; +using System.IO; +using System.Linq; +using Microsoft.ML.AutoML; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Samples +{ + public static class BinaryClassificationExperiment + { + private static string TrainDataPath = ""; + private static string TestDataPath = ""; + private static string ModelPath = @"\SentimentModel.zip"; + private static uint ExperimentTime = 60; + + public static void Run() + { + MLContext mlContext = new MLContext(); + + // STEP 1: Load data + IDataView trainDataView = mlContext.Data.LoadFromTextFile(TrainDataPath, hasHeader: true); + IDataView testDataView = mlContext.Data.LoadFromTextFile(TestDataPath, hasHeader: true); + + // STEP 2: Run AutoML experiment + Console.WriteLine($"Running AutoML binary classification experiment for {ExperimentTime} seconds..."); + ExperimentResult experimentResult = mlContext.Auto() + .CreateBinaryClassificationExperiment(ExperimentTime) + .Execute(trainDataView); + + // STEP 3: Print metric from the best model + RunDetail bestRun = experimentResult.BestRun; + Console.WriteLine($"Total models produced: {experimentResult.RunDetails.Count()}"); + Console.WriteLine($"Best model's trainer: {bestRun.TrainerName}"); + Console.WriteLine($"Metrics of best model from validation data --"); + PrintMetrics(bestRun.ValidationMetrics); + + // STEP 4: Evaluate test data + IDataView testDataViewWithBestScore = bestRun.Model.Transform(testDataView); + BinaryClassificationMetrics testMetrics = mlContext.BinaryClassification.EvaluateNonCalibrated(testDataViewWithBestScore); + 
Console.WriteLine($"Metrics of best model on test data --"); + PrintMetrics(testMetrics); + + // STEP 5: Save the best model for later deployment and inferencing + using (FileStream fs = File.Create(ModelPath)) + mlContext.Model.Save(bestRun.Model, trainDataView.Schema, fs); + + // STEP 6: Create prediction engine from the best trained model + var predictionEngine = mlContext.Model.CreatePredictionEngine(bestRun.Model); + + // STEP 7: Initialize a new sentiment issue, and get the predicted sentiment + var testSentimentIssue = new SentimentIssue + { + Text = "I hope this helps." + }; + var prediction = predictionEngine.Predict(testSentimentIssue); + Console.WriteLine($"Predicted sentiment for test issue: {prediction.Prediction}"); + + Console.WriteLine("Press any key to continue..."); + Console.ReadKey(); + } + + private static void PrintMetrics(BinaryClassificationMetrics metrics) + { + Console.WriteLine($"Accuracy: {metrics.Accuracy}"); + Console.WriteLine($"AreaUnderPrecisionRecallCurve: {metrics.AreaUnderPrecisionRecallCurve}"); + Console.WriteLine($"AreaUnderRocCurve: {metrics.AreaUnderRocCurve}"); + Console.WriteLine($"F1Score: {metrics.F1Score}"); + Console.WriteLine($"NegativePrecision: {metrics.NegativePrecision}"); + Console.WriteLine($"NegativeRecall: {metrics.NegativeRecall}"); + Console.WriteLine($"PositivePrecision: {metrics.PositivePrecision}"); + Console.WriteLine($"PositiveRecall: {metrics.PositiveRecall}"); + } + } +} diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/PixelData.cs b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/PixelData.cs new file mode 100644 index 0000000000..8c745f97d0 --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/PixelData.cs @@ -0,0 +1,14 @@ +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Samples +{ + public class PixelData + { + [LoadColumn(0, 63)] + [VectorType(64)] + public float[] PixelValues; + + [LoadColumn(64)] + public float Number; + } +} diff --git 
a/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/PixelPrediction.cs b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/PixelPrediction.cs new file mode 100644 index 0000000000..0cd8878ec1 --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/PixelPrediction.cs @@ -0,0 +1,10 @@ +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Samples +{ + public class PixelPrediction + { + [ColumnName("PredictedLabel")] + public float Prediction; + } +} diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/SentimentIssue.cs b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/SentimentIssue.cs new file mode 100644 index 0000000000..1480c5da5d --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/SentimentIssue.cs @@ -0,0 +1,13 @@ +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Samples +{ + public class SentimentIssue + { + [LoadColumn(0)] + public bool Label { get; set; } + + [LoadColumn(1)] + public string Text { get; set; } + } +} diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/SentimentPrediction.cs b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/SentimentPrediction.cs new file mode 100644 index 0000000000..ac77d77c4e --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/SentimentPrediction.cs @@ -0,0 +1,14 @@ +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Samples +{ + public class SentimentPrediction + { + // ColumnName attribute is used to change the column name from + // its default value, which is the name of the field. 
+ [ColumnName("PredictedLabel")] + public bool Prediction { get; set; } + + public float Score { get; set; } + } +} diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/TaxiTrip.cs b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/TaxiTrip.cs new file mode 100644 index 0000000000..9a7ed798d8 --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/TaxiTrip.cs @@ -0,0 +1,28 @@ +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Samples +{ + public class TaxiTrip + { + [LoadColumn(0)] + public string VendorId; + + [LoadColumn(1)] + public float RateCode; + + [LoadColumn(2)] + public float PassengerCount; + + [LoadColumn(3)] + public float TripTimeInSeconds; + + [LoadColumn(4)] + public float TripDistance; + + [LoadColumn(5)] + public string PaymentType; + + [LoadColumn(6)] + public float FareAmount; + } +} diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/TaxiTripFarePrediction.cs b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/TaxiTripFarePrediction.cs new file mode 100644 index 0000000000..fde03d24b2 --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/DataStructures/TaxiTripFarePrediction.cs @@ -0,0 +1,10 @@ +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Samples +{ + public class TaxiTripFarePrediction + { + [ColumnName("Score")] + public float FareAmount; + } +} diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/Microsoft.ML.AutoML.Samples.csproj b/docs/samples/Microsoft.ML.AutoML.Samples/Microsoft.ML.AutoML.Samples.csproj new file mode 100644 index 0000000000..c54b5d3cf8 --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/Microsoft.ML.AutoML.Samples.csproj @@ -0,0 +1,12 @@ + + + + Exe + netcoreapp2.1 + + + + + + + diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/MulticlassClassificationExperiment.cs b/docs/samples/Microsoft.ML.AutoML.Samples/MulticlassClassificationExperiment.cs new file mode 100644 index 0000000000..4fb2ec1073 --- /dev/null +++ 
b/docs/samples/Microsoft.ML.AutoML.Samples/MulticlassClassificationExperiment.cs @@ -0,0 +1,71 @@ +using System; +using System.IO; +using System.Linq; +using Microsoft.ML.AutoML; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Samples +{ + public static class MulticlassClassificationExperiment + { + private static string TrainDataPath = ""; + private static string TestDataPath = ""; + private static string ModelPath = @"\OptDigitsModel.zip"; + private static string LabelColumnName = "Number"; + private static uint ExperimentTime = 60; + + public static void Run() + { + MLContext mlContext = new MLContext(); + + // STEP 1: Load data + IDataView trainDataView = mlContext.Data.LoadFromTextFile(TrainDataPath, separatorChar: ','); + IDataView testDataView = mlContext.Data.LoadFromTextFile(TestDataPath, separatorChar: ','); + + // STEP 2: Run AutoML experiment + Console.WriteLine($"Running AutoML multiclass classification experiment for {ExperimentTime} seconds..."); + ExperimentResult experimentResult = mlContext.Auto() + .CreateMulticlassClassificationExperiment(ExperimentTime) + .Execute(trainDataView, LabelColumnName); + + // STEP 3: Print metric from the best model + RunDetail bestRun = experimentResult.BestRun; + Console.WriteLine($"Total models produced: {experimentResult.RunDetails.Count()}"); + Console.WriteLine($"Best model's trainer: {bestRun.TrainerName}"); + Console.WriteLine($"Metrics of best model from validation data --"); + PrintMetrics(bestRun.ValidationMetrics); + + // STEP 4: Evaluate test data + IDataView testDataViewWithBestScore = bestRun.Model.Transform(testDataView); + MulticlassClassificationMetrics testMetrics = mlContext.MulticlassClassification.Evaluate(testDataViewWithBestScore, labelColumnName: LabelColumnName); + Console.WriteLine($"Metrics of best model on test data --"); + PrintMetrics(testMetrics); + + // STEP 5: Save the best model for later deployment and inferencing + using (FileStream fs = File.Create(ModelPath)) + 
mlContext.Model.Save(bestRun.Model, trainDataView.Schema, fs); + + // STEP 6: Create prediction engine from the best trained model + var predictionEngine = mlContext.Model.CreatePredictionEngine(bestRun.Model); + + // STEP 7: Initialize new pixel data, and get the predicted number + var testPixelData = new PixelData + { + PixelValues = new float[] { 0, 0, 1, 8, 15, 10, 0, 0, 0, 3, 13, 15, 14, 14, 0, 0, 0, 5, 10, 0, 10, 12, 0, 0, 0, 0, 3, 5, 15, 10, 2, 0, 0, 0, 16, 16, 16, 16, 12, 0, 0, 1, 8, 12, 14, 8, 3, 0, 0, 0, 0, 10, 13, 0, 0, 0, 0, 0, 0, 11, 9, 0, 0, 0 } + }; + var prediction = predictionEngine.Predict(testPixelData); + Console.WriteLine($"Predicted number for test pixels: {prediction.Prediction}"); + + Console.WriteLine("Press any key to continue..."); + Console.ReadKey(); + } + + private static void PrintMetrics(MulticlassClassificationMetrics metrics) + { + Console.WriteLine($"LogLoss: {metrics.LogLoss}"); + Console.WriteLine($"LogLossReduction: {metrics.LogLossReduction}"); + Console.WriteLine($"MacroAccuracy: {metrics.MacroAccuracy}"); + Console.WriteLine($"MicroAccuracy: {metrics.MicroAccuracy}"); + } + } +} diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs b/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs new file mode 100644 index 0000000000..115764bd4f --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs @@ -0,0 +1,30 @@ +using System; + +namespace Microsoft.ML.AutoML.Samples +{ + public class Program + { + public static void Main(string[] args) + { + try + { + RegressionExperiment.Run(); + Console.Clear(); + + BinaryClassificationExperiment.Run(); + Console.Clear(); + + MulticlassClassificationExperiment.Run(); + Console.Clear(); + + Console.WriteLine("Done"); + } + catch (Exception ex) + { + Console.WriteLine($"Exception {ex}"); + } + + Console.ReadLine(); + } + } +} diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/RegressionExperiment.cs b/docs/samples/Microsoft.ML.AutoML.Samples/RegressionExperiment.cs 
new file mode 100644 index 0000000000..5c6d3b3195 --- /dev/null +++ b/docs/samples/Microsoft.ML.AutoML.Samples/RegressionExperiment.cs @@ -0,0 +1,76 @@ +using System; +using System.IO; +using System.Linq; +using Microsoft.ML.AutoML; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Samples +{ + public static class RegressionExperiment + { + private static string TrainDataPath = ""; + private static string TestDataPath = ""; + private static string ModelPath = @"\TaxiFareModel.zip"; + private static string LabelColumnName = "FareAmount"; + private static uint ExperimentTime = 60; + + public static void Run() + { + MLContext mlContext = new MLContext(); + + // STEP 1: Load data + IDataView trainDataView = mlContext.Data.LoadFromTextFile(TrainDataPath, hasHeader: true, separatorChar: ','); + IDataView testDataView = mlContext.Data.LoadFromTextFile(TestDataPath, hasHeader: true, separatorChar: ','); + + // STEP 2: Run AutoML experiment + Console.WriteLine($"Running AutoML regression experiment for {ExperimentTime} seconds..."); + ExperimentResult experimentResult = mlContext.Auto() + .CreateRegressionExperiment(ExperimentTime) + .Execute(trainDataView, LabelColumnName); + + // STEP 3: Print metric from best model + RunDetail bestRun = experimentResult.BestRun; + Console.WriteLine($"Total models produced: {experimentResult.RunDetails.Count()}"); + Console.WriteLine($"Best model's trainer: {bestRun.TrainerName}"); + Console.WriteLine($"Metrics of best model from validation data --"); + PrintMetrics(bestRun.ValidationMetrics); + + // STEP 5: Evaluate test data + IDataView testDataViewWithBestScore = bestRun.Model.Transform(testDataView); + RegressionMetrics testMetrics = mlContext.Regression.Evaluate(testDataViewWithBestScore, labelColumnName: LabelColumnName); + Console.WriteLine($"Metrics of best model on test data --"); + PrintMetrics(testMetrics); + + // STEP 6: Save the best model for later deployment and inferencing + using (FileStream fs = 
File.Create(ModelPath)) + mlContext.Model.Save(bestRun.Model, trainDataView.Schema, fs); + + // STEP 7: Create prediction engine from the best trained model + var predictionEngine = mlContext.Model.CreatePredictionEngine(bestRun.Model); + + // STEP 8: Initialize a new test taxi trip, and get the predicted fare + var testTaxiTrip = new TaxiTrip + { + VendorId = "VTS", + RateCode = 1, + PassengerCount = 1, + TripTimeInSeconds = 1140, + TripDistance = 3.75f, + PaymentType = "CRD" + }; + var prediction = predictionEngine.Predict(testTaxiTrip); + Console.WriteLine($"Predicted fare for test taxi trip: {prediction.FareAmount}"); + + Console.WriteLine("Press any key to continue..."); + Console.ReadKey(); + } + + private static void PrintMetrics(RegressionMetrics metrics) + { + Console.WriteLine($"MeanAbsoluteError: {metrics.MeanAbsoluteError}"); + Console.WriteLine($"MeanSquaredError: {metrics.MeanSquaredError}"); + Console.WriteLine($"RootMeanSquaredError: {metrics.RootMeanSquaredError}"); + Console.WriteLine($"RSquared: {metrics.RSquared}"); + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/InceptionV3TransferLearning.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/InceptionV3TransferLearning.cs deleted file mode 100644 index 0e1ec80973..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/InceptionV3TransferLearning.cs +++ /dev/null @@ -1,109 +0,0 @@ -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using Microsoft.ML; -using Microsoft.ML.Data; -using Microsoft.ML.Transforms; - -namespace Samples.Dynamic -{ - public static class InceptionV3TransferLearning - { - /// - /// Example use of Image classification API in a ML.NET pipeline. 
- /// - public static void Example() - { - var mlContext = new MLContext(seed: 1); - - var imagesDataFile = Path.GetDirectoryName( - Microsoft.ML.SamplesUtils.DatasetUtils.DownloadImages()); - - var data = mlContext.Data.LoadFromEnumerable( - ImageNetData.LoadImagesFromDirectory(imagesDataFile, 4)); - - data = mlContext.Data.ShuffleRows(data, 5); - var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Label") - .Append(mlContext.Transforms.LoadImages("ImageObject", null, - "ImagePath")) - .Append(mlContext.Transforms.ResizeImages("Image", - inputColumnName: "ImageObject", imageWidth: 299, - imageHeight: 299)) - .Append(mlContext.Transforms.ExtractPixels("Image", - interleavePixelColors: true)) - .Append(mlContext.Model.ImageClassification("Image", - "Label", arch: DnnEstimator.Architecture.InceptionV3, epoch: 4, - batchSize: 4)); - - var trainedModel = pipeline.Fit(data); - var predicted = trainedModel.Transform(data); - var metrics = mlContext.MulticlassClassification.Evaluate(predicted); - - Console.WriteLine($"Micro-accuracy: {metrics.MicroAccuracy}," + - $"macro-accuracy = {metrics.MacroAccuracy}"); - - // Create prediction function and test prediction - var predictFunction = mlContext.Model - .CreatePredictionEngine(trainedModel); - - var prediction = predictFunction - .Predict(ImageNetData.LoadImagesFromDirectory(imagesDataFile, 4) - .First()); - - Console.WriteLine($"Scores : [{string.Join(",", prediction.Score)}], " + - $"Predicted Label : {prediction.PredictedLabel}"); - - } - } - - public class ImageNetData - { - [LoadColumn(0)] - public string ImagePath; - - [LoadColumn(1)] - public string Label; - - public static IEnumerable LoadImagesFromDirectory( - string folder, int repeat = 1, bool useFolderNameasLabel = false) - { - var files = Directory.GetFiles(folder, "*", - searchOption: SearchOption.AllDirectories); - - foreach (var file in files) - { - if (Path.GetExtension(file) != ".jpg") - continue; - - var label = Path.GetFileName(file); - if 
(useFolderNameasLabel) - label = Directory.GetParent(file).Name; - else - { - for (int index = 0; index < label.Length; index++) - { - if (!char.IsLetter(label[index])) - { - label = label.Substring(0, index); - break; - } - } - } - - for (int index = 0; index < repeat; index++) - yield return new ImageNetData() { - ImagePath = file,Label = label }; - } - } - } - - public class ImagePrediction - { - [ColumnName("Score")] - public float[] Score; - - [ColumnName("PredictedLabel")] - public Int64 PredictedLabel; - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/ResnetV2101TransferLearning.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/ResnetV2101TransferLearning.cs deleted file mode 100644 index 9d3136b01b..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/ResnetV2101TransferLearning.cs +++ /dev/null @@ -1,123 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.IO; -using System.Linq; -using Microsoft.ML; -using Microsoft.ML.Data; -using Microsoft.ML.Transforms; - -namespace Samples.Dynamic -{ - public static class ResnetV2101TransferLearning - { - /// - /// Example use of Image classification API in a ML.NET pipeline. 
- /// - public static void Example() - { - var mlContext = new MLContext(seed: 1); - - var imagesDataFile = Path.GetDirectoryName( - Microsoft.ML.SamplesUtils.DatasetUtils.DownloadImages()); - - var data = mlContext.Data.LoadFromEnumerable( - ImageNetData.LoadImagesFromDirectory(imagesDataFile, 4)); - - data = mlContext.Data.ShuffleRows(data, 5); - var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Label") - .Append(mlContext.Transforms.LoadImages("ImageObject", null, - "ImagePath")) - .Append(mlContext.Transforms.ResizeImages("Image", - inputColumnName: "ImageObject", imageWidth: 299, - imageHeight: 299)) - .Append(mlContext.Transforms.ExtractPixels("Image", - interleavePixelColors: true)) - .Append(mlContext.Model.ImageClassification("Image", - "Label", arch: DnnEstimator.Architecture.ResnetV2101, epoch: 4, - batchSize: 4)); - - var trainedModel = pipeline.Fit(data); - var predicted = trainedModel.Transform(data); - var metrics = mlContext.MulticlassClassification.Evaluate(predicted); - - Console.WriteLine($"Micro-accuracy: {metrics.MicroAccuracy}," + - $"macro-accuracy = {metrics.MacroAccuracy}"); - - mlContext.Model.Save(trainedModel, data.Schema, "model.zip"); - - ITransformer loadedModel; - using (var file = File.OpenRead("model.zip")) - loadedModel = mlContext.Model.Load(file, out DataViewSchema schema); - - // Create prediction function and test prediction - var predictFunction = mlContext.Model - .CreatePredictionEngine(loadedModel); - - var prediction = predictFunction - .Predict(ImageNetData.LoadImagesFromDirectory(imagesDataFile, 4) - .First()); - - Console.WriteLine($"Scores : [{string.Join(",", prediction.Score)}], " + - $"Predicted Label : {prediction.PredictedLabel}"); - } - - private const int imageHeight = 224; - private const int imageWidth = 224; - private const int numChannels = 3; - private const int inputSize = imageHeight * imageWidth * numChannels; - - public class ImageNetData - { - [LoadColumn(0)] - public string ImagePath; - - 
[LoadColumn(1)] - public string Label; - - public static IEnumerable LoadImagesFromDirectory( - string folder, int repeat = 1, bool useFolderNameasLabel = false) - { - var files = Directory.GetFiles(folder, "*", - searchOption: SearchOption.AllDirectories); - - foreach (var file in files) - { - if (Path.GetExtension(file) != ".jpg") - continue; - - var label = Path.GetFileName(file); - if (useFolderNameasLabel) - label = Directory.GetParent(file).Name; - else - { - for (int index = 0; index < label.Length; index++) - { - if (!char.IsLetter(label[index])) - { - label = label.Substring(0, index); - break; - } - } - } - - for (int index = 0; index < repeat; index++) - yield return new ImageNetData() - { - ImagePath = file, - Label = label - }; - } - } - } - - public class ImagePrediction - { - [ColumnName("Score")] - public float[] Score; - - [ColumnName("PredictedLabel")] - public Int64 PredictedLabel; - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/ResnetV2101TransferLearningTrainTestSplit.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/ResnetV2101TransferLearningTrainTestSplit.cs new file mode 100644 index 0000000000..ed3f4da5a4 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ImageClassification/ResnetV2101TransferLearningTrainTestSplit.cs @@ -0,0 +1,307 @@ + +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; +using Microsoft.ML; +using Microsoft.ML.Transforms; +using static Microsoft.ML.DataOperationsCatalog; +using System.Linq; +using Microsoft.ML.Data; +using System.IO.Compression; +using System.Threading; +using System.Net; + +namespace Samples.Dynamic +{ + public class ResnetV2101TransferLearningTrainTestSplit + { + public static void Example() + { + string assetsRelativePath = @"../../../assets"; + string assetsPath = GetAbsolutePath(assetsRelativePath); + + var outputMlNetModelFilePath = Path.Combine(assetsPath, "outputs", + 
"imageClassifier.zip"); + + string imagesDownloadFolderPath = Path.Combine(assetsPath, "inputs", + "images"); + + //Download the image set and unzip + string finalImagesFolderName = DownloadImageSet( + imagesDownloadFolderPath); + + string fullImagesetFolderPath = Path.Combine( + imagesDownloadFolderPath, finalImagesFolderName); + + try + { + + MLContext mlContext = new MLContext(seed: 1); + + //Load all the original images info + IEnumerable images = LoadImagesFromDirectory( + folder: fullImagesetFolderPath, useFolderNameasLabel: true); + + IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows( + mlContext.Data.LoadFromEnumerable(images)); + + shuffledFullImagesDataset = mlContext.Transforms.Conversion + .MapValueToKey("Label") + .Fit(shuffledFullImagesDataset) + .Transform(shuffledFullImagesDataset); + + // Split the data 90:10 into train and test sets, train and evaluate. + TrainTestData trainTestData = mlContext.Data.TrainTestSplit( + shuffledFullImagesDataset, testFraction: 0.1, seed: 1); + + IDataView trainDataset = trainTestData.TrainSet; + IDataView testDataset = trainTestData.TestSet; + + var pipeline = mlContext.Model.ImageClassification( + "ImagePath", "Label", + // Just by changing/selecting InceptionV3 here instead of + // ResnetV2101 you can try a different architecture/pre-trained + // model. 
+ arch: ImageClassificationEstimator.Architecture.ResnetV2101, + epoch: 50, + batchSize: 10, + learningRate: 0.01f, + metricsCallback: (metrics) => Console.WriteLine(metrics), + validationSet: testDataset); + + + Console.WriteLine("*** Training the image classification model with " + + "DNN Transfer Learning on top of the selected pre-trained " + + "model/architecture ***"); + + // Measuring training time + var watch = System.Diagnostics.Stopwatch.StartNew(); + + var trainedModel = pipeline.Fit(trainDataset); + + watch.Stop(); + long elapsedMs = watch.ElapsedMilliseconds; + + Console.WriteLine("Training with transfer learning took: " + + (elapsedMs / 1000).ToString() + " seconds"); + + mlContext.Model.Save(trainedModel, shuffledFullImagesDataset.Schema, + "model.zip"); + + ITransformer loadedModel; + DataViewSchema schema; + using (var file = File.OpenRead("model.zip")) + loadedModel = mlContext.Model.Load(file, out schema); + + EvaluateModel(mlContext, testDataset, loadedModel); + + VBuffer> keys = default; + loadedModel.GetOutputSchema(schema)["Label"].GetKeyValues(ref keys); + + watch = System.Diagnostics.Stopwatch.StartNew(); + TrySinglePrediction(fullImagesetFolderPath, mlContext, loadedModel, + keys.DenseValues().ToArray()); + + watch.Stop(); + elapsedMs = watch.ElapsedMilliseconds; + + Console.WriteLine("Prediction engine took: " + + (elapsedMs / 1000).ToString() + " seconds"); + } + catch (Exception ex) + { + Console.WriteLine(ex.ToString()); + } + + Console.WriteLine("Press any key to finish"); + Console.ReadKey(); + } + + private static void TrySinglePrediction(string imagesForPredictions, + MLContext mlContext, ITransformer trainedModel, + ReadOnlyMemory[] originalLabels) + { + // Create prediction function to try one prediction + var predictionEngine = mlContext.Model + .CreatePredictionEngine(trainedModel); + + IEnumerable testImages = LoadImagesFromDirectory( + imagesForPredictions, false); + + ImageData imageToPredict = new ImageData + { + ImagePath 
= testImages.First().ImagePath + }; + + var prediction = predictionEngine.Predict(imageToPredict); + var index = prediction.PredictedLabel; + + Console.WriteLine($"ImageFile : " + + $"[{Path.GetFileName(imageToPredict.ImagePath)}], " + + $"Scores : [{string.Join(",", prediction.Score)}], " + + $"Predicted Label : {originalLabels[index]}"); + } + + + private static void EvaluateModel(MLContext mlContext, + IDataView testDataset, ITransformer trainedModel) + { + Console.WriteLine("Making bulk predictions and evaluating model's " + + "quality..."); + + // Measuring time + var watch2 = System.Diagnostics.Stopwatch.StartNew(); + + IDataView predictions = trainedModel.Transform(testDataset); + var metrics = mlContext.MulticlassClassification.Evaluate(predictions); + + Console.WriteLine($"Micro-accuracy: {metrics.MicroAccuracy}," + + $"macro-accuracy = {metrics.MacroAccuracy}"); + + watch2.Stop(); + long elapsed2Ms = watch2.ElapsedMilliseconds; + + Console.WriteLine("Predicting and Evaluation took: " + + (elapsed2Ms / 1000).ToString() + " seconds"); + } + + public static IEnumerable LoadImagesFromDirectory(string folder, + bool useFolderNameasLabel = true) + { + var files = Directory.GetFiles(folder, "*", + searchOption: SearchOption.AllDirectories); + + foreach (var file in files) + { + if (Path.GetExtension(file) != ".jpg") + continue; + + var label = Path.GetFileName(file); + if (useFolderNameasLabel) + label = Directory.GetParent(file).Name; + else + { + for (int index = 0; index < label.Length; index++) + { + if (!char.IsLetter(label[index])) + { + label = label.Substring(0, index); + break; + } + } + } + + yield return new ImageData() + { + ImagePath = file, + Label = label + }; + + } + } + + public static string DownloadImageSet(string imagesDownloadFolder) + { + // get a set of images to teach the network about the new classes + + //SINGLE SMALL FLOWERS IMAGESET (200 files) + string fileName = "flower_photos_small_set.zip"; + string url = 
$"https://mlnetfilestorage.file.core.windows.net/" + + $"imagesets/flower_images/flower_photos_small_set.zip?st=2019-08-" + + $"07T21%3A27%3A44Z&se=2030-08-08T21%3A27%3A00Z&sp=rl&sv=2018-03-" + + $"28&sr=f&sig=SZ0UBX47pXD0F1rmrOM%2BfcwbPVob8hlgFtIlN89micM%3D"; + + Download(url, imagesDownloadFolder, fileName); + UnZip(Path.Combine(imagesDownloadFolder, fileName), imagesDownloadFolder); + + return Path.GetFileNameWithoutExtension(fileName); + } + + public static bool Download(string url, string destDir, string destFileName) + { + if (destFileName == null) + destFileName = url.Split(Path.DirectorySeparatorChar).Last(); + + Directory.CreateDirectory(destDir); + + string relativeFilePath = Path.Combine(destDir, destFileName); + + if (File.Exists(relativeFilePath)) + { + Console.WriteLine($"{relativeFilePath} already exists."); + return false; + } + + var wc = new WebClient(); + Console.WriteLine($"Downloading {relativeFilePath}"); + var download = Task.Run(() => wc.DownloadFile(url, relativeFilePath)); + while (!download.IsCompleted) + { + Thread.Sleep(1000); + Console.Write("."); + } + Console.WriteLine(""); + Console.WriteLine($"Downloaded {relativeFilePath}"); + + return true; + } + + public static void UnZip(String gzArchiveName, String destFolder) + { + var flag = gzArchiveName.Split(Path.DirectorySeparatorChar) + .Last() + .Split('.') + .First() + ".bin"; + + if (File.Exists(Path.Combine(destFolder, flag))) return; + + Console.WriteLine($"Extracting."); + var task = Task.Run(() => + { + ZipFile.ExtractToDirectory(gzArchiveName, destFolder); + }); + + while (!task.IsCompleted) + { + Thread.Sleep(200); + Console.Write("."); + } + + File.Create(Path.Combine(destFolder, flag)); + Console.WriteLine(""); + Console.WriteLine("Extracting is completed."); + } + + public static string GetAbsolutePath(string relativePath) + { + FileInfo _dataRoot = new FileInfo(typeof( + ResnetV2101TransferLearningTrainTestSplit).Assembly.Location); + + string assemblyFolderPath = 
_dataRoot.Directory.FullName; + + string fullPath = Path.Combine(assemblyFolderPath, relativePath); + + return fullPath; + } + + public class ImageData + { + [LoadColumn(0)] + public string ImagePath; + + [LoadColumn(1)] + public string Label; + } + + public class ImagePrediction + { + [ColumnName("Score")] + public float[] Score; + + [ColumnName("PredictedLabel")] + public UInt32 PredictedLabel; + } + } +} + diff --git a/pkg/Microsoft.ML.AutoML/Microsoft.ML.AutoML.nupkgproj b/pkg/Microsoft.ML.AutoML/Microsoft.ML.AutoML.nupkgproj new file mode 100644 index 0000000000..aafa391392 --- /dev/null +++ b/pkg/Microsoft.ML.AutoML/Microsoft.ML.AutoML.nupkgproj @@ -0,0 +1,14 @@ + + + + netstandard2.0 + ML.NET AutoML: Optimizes an ML pipeline for your dataset, by automatically locating the best feature engineering, model, and hyperparameters + + + + + + + + + diff --git a/pkg/Microsoft.ML.AutoML/Microsoft.ML.AutoML.symbols.nupkgproj b/pkg/Microsoft.ML.AutoML/Microsoft.ML.AutoML.symbols.nupkgproj new file mode 100644 index 0000000000..a648ab1d59 --- /dev/null +++ b/pkg/Microsoft.ML.AutoML/Microsoft.ML.AutoML.symbols.nupkgproj @@ -0,0 +1,3 @@ + + + diff --git a/src/Microsoft.Extensions.ML/Builder/BuilderExtensions.cs b/src/Microsoft.Extensions.ML/Builder/BuilderExtensions.cs index ebeb366086..45ededdb10 100644 --- a/src/Microsoft.Extensions.ML/Builder/BuilderExtensions.cs +++ b/src/Microsoft.Extensions.ML/Builder/BuilderExtensions.cs @@ -158,6 +158,25 @@ public static PredictionEnginePoolBuilder FromFile + /// Adds the model at the specified file to the builder. + /// + /// The builder to which to add the model. + /// The location of the model. + /// + /// Whether to watch for changes to the file path and update the model when the file is changed or not. + /// + /// + /// The updated . 
+ /// + public static PredictionEnginePoolBuilder FromFile( + this PredictionEnginePoolBuilder builder, string filePath, bool watchForChanges) + where TData : class + where TPrediction : class, new() + { + return builder.FromFile(string.Empty, filePath, watchForChanges); + } + /// /// Adds the model at the specified file to the builder. /// diff --git a/src/Microsoft.ML.AutoML/API/AutoCatalog.cs b/src/Microsoft.ML.AutoML/API/AutoCatalog.cs new file mode 100644 index 0000000000..30a4d7b803 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/AutoCatalog.cs @@ -0,0 +1,212 @@ +// Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + /// + /// A catalog of all available AutoML tasks. + /// + public sealed class AutoCatalog + { + private readonly MLContext _context; + + internal AutoCatalog(MLContext context) + { + _context = context; + } + + /// + /// Creates a new AutoML experiment to run on a regression dataset. + /// + /// Maximum number of seconds that experiment will run. + /// A new AutoML regression experiment. + /// + /// See for a more detailed code example of an AutoML regression experiment. + /// An experiment may run for longer than . + /// This is because once AutoML starts training an ML.NET model, AutoML lets the + /// model train to completion. For instance, if the first model + /// AutoML trains takes 4 hours, and the second model trained takes 5 hours, + /// but was the number of seconds in 6 hours, + /// the experiment will run for 4 + 5 = 9 hours (not 6 hours). 
+ /// + public RegressionExperiment CreateRegressionExperiment(uint maxExperimentTimeInSeconds) + { + return new RegressionExperiment(_context, new RegressionExperimentSettings() + { + MaxExperimentTimeInSeconds = maxExperimentTimeInSeconds + }); + } + + /// + /// Creates a new AutoML experiment to run on a regression dataset. + /// + /// Settings for the AutoML experiment. + /// A new AutoML regression experiment. + /// + /// See for a more detailed code example of an AutoML regression experiment. + /// + public RegressionExperiment CreateRegressionExperiment(RegressionExperimentSettings experimentSettings) + { + return new RegressionExperiment(_context, experimentSettings); + } + + /// + /// Creates a new AutoML experiment to run on a binary classification dataset. + /// + /// Maximum number of seconds that experiment will run. + /// A new AutoML binary classification experiment. + /// + /// See for a more detailed code example of an AutoML binary classification experiment. + /// An experiment may run for longer than . + /// This is because once AutoML starts training an ML.NET model, AutoML lets the + /// model train to completion. For instance, if the first model + /// AutoML trains takes 4 hours, and the second model trained takes 5 hours, + /// but was the number of seconds in 6 hours, + /// the experiment will run for 4 + 5 = 9 hours (not 6 hours). + /// + public BinaryClassificationExperiment CreateBinaryClassificationExperiment(uint maxExperimentTimeInSeconds) + { + return new BinaryClassificationExperiment(_context, new BinaryExperimentSettings() + { + MaxExperimentTimeInSeconds = maxExperimentTimeInSeconds + }); + } + + /// + /// Creates a new AutoML experiment to run on a binary classification dataset. + /// + /// Settings for the AutoML experiment. + /// A new AutoML binary classification experiment. + /// + /// See for a more detailed code example of an AutoML binary classification experiment. 
+ /// + public BinaryClassificationExperiment CreateBinaryClassificationExperiment(BinaryExperimentSettings experimentSettings) + { + return new BinaryClassificationExperiment(_context, experimentSettings); + } + + /// + /// Creates a new AutoML experiment to run on a multiclass classification dataset. + /// + /// Maximum number of seconds that experiment will run. + /// A new AutoML multiclass classification experiment. + /// + /// See for a more detailed code example of an AutoML multiclass classification experiment. + /// An experiment may run for longer than . + /// This is because once AutoML starts training an ML.NET model, AutoML lets the + /// model train to completion. For instance, if the first model + /// AutoML trains takes 4 hours, and the second model trained takes 5 hours, + /// but was the number of seconds in 6 hours, + /// the experiment will run for 4 + 5 = 9 hours (not 6 hours). + /// + public MulticlassClassificationExperiment CreateMulticlassClassificationExperiment(uint maxExperimentTimeInSeconds) + { + return new MulticlassClassificationExperiment(_context, new MulticlassExperimentSettings() + { + MaxExperimentTimeInSeconds = maxExperimentTimeInSeconds + }); + } + + /// + /// Creates a new AutoML experiment to run on a multiclass classification dataset. + /// + /// Settings for the AutoML experiment. + /// A new AutoML multiclass classification experiment. + /// + /// See for a more detailed code example of an AutoML multiclass classification experiment. + /// + public MulticlassClassificationExperiment CreateMulticlassClassificationExperiment(MulticlassExperimentSettings experimentSettings) + { + return new MulticlassClassificationExperiment(_context, experimentSettings); + } + + /// + /// Infers information about the columns of a dataset in a file located at . + /// + /// Path to a dataset file. + /// The name of the label column. + /// The character used as separator between data elements in a row. 
If , AutoML will try to infer this value. + /// Whether the file can contain columns defined by a quoted string. If , AutoML will try to infer this value. + /// Whether the file can contain numerical vectors in sparse format. If , AutoML will try to infer this value. + /// Whether trailing whitespace should be removed from dataset file lines. + /// Whether to group together (when possible) original columns in the dataset file into vector columns in the resulting data structures. See for more information. + /// Information inferred about the columns in the provided dataset. + /// + /// Infers information about the name, data type, and purpose of each column. + /// The returned can be used to + /// instantiate a . The can be used to + /// obtain an that can be fed into an AutoML experiment, + /// or used elsewhere in the ML.NET ecosystem (ie in . + /// The contains the inferred purpose of each column in the dataset. + /// (For instance, is the column categorical, numeric, or text data? Should the column be ignored? Etc.) + /// The can be inspected and modified (or kept as is) and used by an AutoML experiment. + /// + public ColumnInferenceResults InferColumns(string path, string labelColumnName = DefaultColumnNames.Label, char? separatorChar = null, bool? allowQuoting = null, + bool? allowSparse = null, bool trimWhitespace = false, bool groupColumns = true) + { + UserInputValidationUtil.ValidateInferColumnsArgs(path, labelColumnName); + return ColumnInferenceApi.InferColumns(_context, path, labelColumnName, separatorChar, allowQuoting, allowSparse, trimWhitespace, groupColumns); + } + + /// + /// Infers information about the columns of a dataset in a file located at . + /// + /// Path to a dataset file. + /// Column information for the dataset. + /// The character used as separator between data elements in a row. If , AutoML will try to infer this value. + /// Whether the file can contain columns defined by a quoted string. If , AutoML will try to infer this value. 
+ /// Whether the file can contain numerical vectors in sparse format. If , AutoML will try to infer this value. + /// Whether trailing whitespace should be removed from dataset file lines. + /// Whether to group together (when possible) original columns in the dataset file into vector columns in the resulting data structures. See for more information. + /// Information inferred about the columns in the provided dataset. + /// + /// Infers information about the name, data type, and purpose of each column. + /// The returned can be used to + /// instantiate a . The can be used to + /// obtain an that can be fed into an AutoML experiment, + /// or used elsewhere in the ML.NET ecosystem (ie in . + /// The contains the inferred purpose of each column in the dataset. + /// (For instance, is the column categorical, numeric, or text data? Should the column be ignored? Etc.) + /// The can be inspected and modified (or kept as is) and used by an AutoML experiment. + /// + public ColumnInferenceResults InferColumns(string path, ColumnInformation columnInformation, char? separatorChar = null, bool? allowQuoting = null, + bool? allowSparse = null, bool trimWhitespace = false, bool groupColumns = true) + { + columnInformation = columnInformation ?? new ColumnInformation(); + UserInputValidationUtil.ValidateInferColumnsArgs(path, columnInformation); + return ColumnInferenceApi.InferColumns(_context, path, columnInformation, separatorChar, allowQuoting, allowSparse, trimWhitespace, groupColumns); + } + + /// + /// Infers information about the columns of a dataset in a file located at . + /// + /// Path to a dataset file. + /// Column index of the label column in the dataset. + /// Whether or not the dataset file has a header row. + /// The character used as separator between data elements in a row. If , AutoML will try to infer this value. + /// Whether the file can contain columns defined by a quoted string. If , AutoML will try to infer this value. 
+ /// Whether the file can contain numerical vectors in sparse format. If , AutoML will try to infer this value. + /// Whether trailing whitespace should be removed from dataset file lines. + /// Whether to group together (when possible) original columns in the dataset file into vector columns in the resulting data structures. See for more information. + /// Information inferred about the columns in the provided dataset. + /// + /// Infers information about the name, data type, and purpose of each column. + /// The returned can be used to + /// instantiate a . The can be used to + /// obtain an that can be fed into an AutoML experiment, + /// or used elsewhere in the ML.NET ecosystem (ie in . + /// The contains the inferred purpose of each column in the dataset. + /// (For instance, is the column categorical, numeric, or text data? Should the column be ignored? Etc.) + /// The can be inspected and modified (or kept as is) and used by an AutoML experiment. + /// + public ColumnInferenceResults InferColumns(string path, uint labelColumnIndex, bool hasHeader = false, char? separatorChar = null, + bool? allowQuoting = null, bool? allowSparse = null, bool trimWhitespace = false, bool groupColumns = true) + { + UserInputValidationUtil.ValidateInferColumnsArgs(path); + return ColumnInferenceApi.InferColumns(_context, path, labelColumnIndex, hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace, groupColumns); + } + } +} diff --git a/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs b/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs new file mode 100644 index 0000000000..84f539c932 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/BinaryClassificationExperiment.cs @@ -0,0 +1,170 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information.
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.FastTree; +using Microsoft.ML.Trainers.LightGbm; + +namespace Microsoft.ML.AutoML +{ + /// + /// Settings for AutoML experiments on binary classification datasets. + /// + public sealed class BinaryExperimentSettings : ExperimentSettings + { + /// + /// Metric that AutoML will try to optimize over the course of the experiment. + /// + /// The default value is . + public BinaryClassificationMetric OptimizingMetric { get; set; } + + /// + /// Collection of trainers the AutoML experiment can leverage. + /// + /// The default value is a collection auto-populated with all possible trainers (all values of ). + public ICollection Trainers { get; } + + /// + /// Initializes a new instance of . + /// + public BinaryExperimentSettings() + { + OptimizingMetric = BinaryClassificationMetric.Accuracy; + Trainers = Enum.GetValues(typeof(BinaryClassificationTrainer)).OfType().ToList(); + } + } + + /// + /// Binary classification metric that AutoML will aim to optimize in its sweeping process during an experiment. + /// + public enum BinaryClassificationMetric + { + /// + /// See . + /// + Accuracy, + + /// + /// See . + /// + AreaUnderRocCurve, + + /// + /// See . + /// + AreaUnderPrecisionRecallCurve, + + /// + /// See . + /// + F1Score, + + /// + /// See . + /// + PositivePrecision, + + /// + /// See . + /// + PositiveRecall, + + /// + /// See . + /// + NegativePrecision, + + /// + /// See . + /// + NegativeRecall, + } + + /// + /// Enumeration of ML.NET binary classification trainers used by AutoML. + /// + public enum BinaryClassificationTrainer + { + /// + /// See . + /// + AveragedPerceptron, + + /// + /// See . + /// + FastForest, + + /// + /// See . + /// + FastTree, + + /// + /// See . + /// + LightGbm, + + /// + /// See . + /// + LinearSvm, + + /// + /// See . + /// + LbfgsLogisticRegression, + + /// + /// See . 
+ /// + SdcaLogisticRegression, + + /// + /// See . + /// + SgdCalibrated, + + /// + /// See . + /// + SymbolicSgdLogisticRegression, + } + + /// + /// AutoML experiment on binary classification datasets. + /// + /// + /// + /// + /// + public sealed class BinaryClassificationExperiment : ExperimentBase + { + internal BinaryClassificationExperiment(MLContext context, BinaryExperimentSettings settings) + : base(context, + new BinaryMetricsAgent(context, settings.OptimizingMetric), + new OptimizingMetricInfo(settings.OptimizingMetric), + settings, + TaskKind.BinaryClassification, + TrainerExtensionUtil.GetTrainerNames(settings.Trainers)) + { + } + + private protected override RunDetail GetBestRun(IEnumerable> results) + { + return BestResultUtil.GetBestRun(results, MetricsAgent, OptimizingMetricInfo.IsMaximizing); + } + + private protected override CrossValidationRunDetail GetBestCrossValRun(IEnumerable> results) + { + return BestResultUtil.GetBestRun(results, MetricsAgent, OptimizingMetricInfo.IsMaximizing); + } + } +} diff --git a/src/Microsoft.ML.AutoML/API/ColumnInference.cs b/src/Microsoft.ML.AutoML/API/ColumnInference.cs new file mode 100644 index 0000000000..0c7d834613 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/ColumnInference.cs @@ -0,0 +1,106 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using System.Collections.ObjectModel; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + /// + /// Contains information AutoML inferred about columns in a dataset. + /// + public sealed class ColumnInferenceResults + { + /// + /// Inferred for the dataset. + /// + /// + /// Can be used to instantiate a new to load + /// data into an . + /// + public TextLoader.Options TextLoaderOptions { get; internal set; } + + /// + /// Information about the inferred columns in the dataset.
+ /// + /// + /// Contains the inferred purposes of each column. See for more details. + /// This can be fed to the AutoML API when running an experiment. + /// See + /// for example. + /// + public ColumnInformation ColumnInformation { get; internal set; } + } + + /// + /// Information about the columns in a dataset. + /// + /// + /// Contains information about the purpose of each column in the dataset. For instance, + /// it enumerates the dataset columns that AutoML should treat as categorical, + /// the columns AutoML should ignore, which column is the label, etc. + /// can be fed to the AutoML API when running an experiment. + /// See + /// for example. + /// + public sealed class ColumnInformation + { + /// + /// The dataset column to use as the label. + /// + /// The default value is "Label". + public string LabelColumnName { get; set; } + + /// + /// The dataset column to use for example weight. + /// + public string ExampleWeightColumnName { get; set; } + + /// + /// The dataset column to use for grouping rows. + /// If two examples share the same sampling key column name, + /// they are guaranteed to appear in the same subset (train or test). + /// This can be used to ensure no label leakage from the train to the test set. + /// If , no row grouping will be performed. + /// + public string SamplingKeyColumnName { get; set; } + + /// + /// The dataset columns that are categorical. + /// + /// The default value is a new, empty . + /// + /// Categorical data columns should generally be columns that contain a small number of unique values. + /// + public ICollection CategoricalColumnNames { get; } + + /// + /// The dataset columns that are numeric. + /// + /// The default value is a new, empty . + public ICollection NumericColumnNames { get; } + + /// + /// The dataset columns that are text. + /// + /// The default value is a new, empty . + public ICollection TextColumnNames { get; } + + /// + /// The dataset columns that AutoML should ignore. 
+ /// + /// The default value is a new, empty . + public ICollection IgnoredColumnNames { get; } + + public ColumnInformation() + { + LabelColumnName = DefaultColumnNames.Label; + CategoricalColumnNames = new Collection(); + NumericColumnNames = new Collection(); + TextColumnNames = new Collection(); + IgnoredColumnNames = new Collection(); + } + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.AutoML/API/ExperimentBase.cs b/src/Microsoft.ML.AutoML/API/ExperimentBase.cs new file mode 100644 index 0000000000..5405229d53 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/ExperimentBase.cs @@ -0,0 +1,341 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.ML.Data; +using Microsoft.ML.Runtime; + +namespace Microsoft.ML.AutoML +{ + /// + /// AutoML experiment base class. All task-specific AutoML experiments + /// (like ) inherit from this class. + /// + /// Metrics type used by task-specific AutoML experiments. + /// Experiment settings type.
+ public abstract class ExperimentBase + where TMetrics : class + where TExperimentSettings : ExperimentSettings + { + private protected readonly MLContext Context; + private protected readonly IMetricsAgent MetricsAgent; + private protected readonly OptimizingMetricInfo OptimizingMetricInfo; + private protected readonly TExperimentSettings Settings; + + private readonly IChannel _logger; + private readonly TaskKind _task; + private readonly IEnumerable _trainerWhitelist; + + internal ExperimentBase(MLContext context, + IMetricsAgent metricsAgent, + OptimizingMetricInfo optimizingMetricInfo, + TExperimentSettings settings, + TaskKind task, + IEnumerable trainerWhitelist) + { + Context = context; + MetricsAgent = metricsAgent; + OptimizingMetricInfo = optimizingMetricInfo; + Settings = settings; + _logger = ((IChannelProvider)context).Start("AutoML"); + _task = task; + _trainerWhitelist = trainerWhitelist; + } + + /// + /// Executes an AutoML experiment. + /// + /// The training data used by the AutoML experiment. + /// The dataset column used as the label. + /// The dataset column used as the sampling key column. + /// See for more information. + /// Pre-featurizer that AutoML will apply to the data during an + /// experiment. (The pre-featurizer will be fit only on the training data split to produce a + /// trained transform. Then, the trained transform will be applied to both the training + /// data split and corresponding validation data split.) + /// A user-defined object that implements + /// the interface. AutoML will invoke the method + /// after each model it produces during the + /// course of the experiment. + /// + /// The experiment result. + /// + /// Depending on the size of your data, the AutoML experiment could take a long time to execute. 
+ /// + public ExperimentResult Execute(IDataView trainData, string labelColumnName = DefaultColumnNames.Label, + string samplingKeyColumn = null, IEstimator preFeaturizer = null, IProgress> progressHandler = null) + { + var columnInformation = new ColumnInformation() + { + LabelColumnName = labelColumnName, + SamplingKeyColumnName = samplingKeyColumn + }; + return Execute(trainData, columnInformation, preFeaturizer, progressHandler); + } + + /// + /// Executes an AutoML experiment. + /// + /// The training data to be used by the AutoML experiment. + /// Column information for the dataset. + /// Pre-featurizer that AutoML will apply to the data during an + /// experiment. (The pre-featurizer will be fit only on the training data split to produce a + /// trained transform. Then, the trained transform will be applied to both the training + /// data split and corresponding validation data split.) + /// A user-defined object that implements + /// the interface. AutoML will invoke the method + /// after each model it produces during the + /// course of the experiment. + /// + /// The experiment result. + /// + /// Depending on the size of your data, the AutoML experiment could take a long time to execute. + /// + public ExperimentResult Execute(IDataView trainData, ColumnInformation columnInformation, + IEstimator preFeaturizer = null, IProgress> progressHandler = null) + { + // Cross val threshold for # of dataset rows -- + // If dataset has < threshold # of rows, use cross val. + // Else, run experiment using train-validate split. 
+ const int crossValRowCountThreshold = 15000; + + var rowCount = DatasetDimensionsUtil.CountRows(trainData, crossValRowCountThreshold); + + if (rowCount < crossValRowCountThreshold) + { + const int numCrossValFolds = 10; + var splitResult = SplitUtil.CrossValSplit(Context, trainData, numCrossValFolds, columnInformation?.SamplingKeyColumnName); + return ExecuteCrossValSummary(splitResult.trainDatasets, columnInformation, splitResult.validationDatasets, preFeaturizer, progressHandler); + } + else + { + var splitResult = SplitUtil.TrainValidateSplit(Context, trainData, columnInformation?.SamplingKeyColumnName); + return ExecuteTrainValidate(splitResult.trainData, columnInformation, splitResult.validationData, preFeaturizer, progressHandler); + } + } + + /// + /// Executes an AutoML experiment. + /// + /// The training data to be used by the AutoML experiment. + /// The validation data to be used by the AutoML experiment. + /// The name of the label column. + /// Pre-featurizer that AutoML will apply to the data during an + /// experiment. (The pre-featurizer will be fit only on the training data split to produce a + /// trained transform. Then, the trained transform will be applied to both the training + /// data split and corresponding validation data split.) + /// A user-defined object that implements + /// the interface. AutoML will invoke the method + /// after each model it produces during the + /// course of the experiment. + /// + /// The experiment result. + /// + /// Depending on the size of your data, the AutoML experiment could take a long time to execute. 
+ /// + public ExperimentResult Execute(IDataView trainData, IDataView validationData, string labelColumnName = DefaultColumnNames.Label, IEstimator preFeaturizer = null, IProgress> progressHandler = null) + { + var columnInformation = new ColumnInformation() { LabelColumnName = labelColumnName }; + return Execute(trainData, validationData, columnInformation, preFeaturizer, progressHandler); + } + + /// + /// Executes an AutoML experiment. + /// + /// The training data to be used by the AutoML experiment. + /// The validation data to be used by the AutoML experiment. + /// Column information for the dataset. + /// Pre-featurizer that AutoML will apply to the data during an + /// experiment. (The pre-featurizer will be fit only on the training data split to produce a + /// trained transform. Then, the trained transform will be applied to both the training + /// data split and corresponding validation data split.) + /// A user-defined object that implements + /// the interface. AutoML will invoke the method + /// after each model it produces during the + /// course of the experiment. + /// + /// The experiment result. + /// + /// Depending on the size of your data, the AutoML experiment could take a long time to execute. + /// + public ExperimentResult Execute(IDataView trainData, IDataView validationData, ColumnInformation columnInformation, IEstimator preFeaturizer = null, IProgress> progressHandler = null) + { + if (validationData == null) + { + return Execute(trainData, columnInformation, preFeaturizer, progressHandler); + } + return ExecuteTrainValidate(trainData, columnInformation, validationData, preFeaturizer, progressHandler); + } + + /// + /// Executes an AutoML experiment. + /// + /// The training data to be used by the AutoML experiment. + /// The number of cross validation folds into which the training data should be divided when fitting a model. + /// Column information for the dataset. 
+ /// Pre-featurizer that AutoML will apply to the data during an + /// experiment. (The pre-featurizer will be fit only on the training data split to produce a + /// trained transform. Then, the trained transform will be applied to both the training + /// data split and corresponding validation data split.) + /// A user-defined object that implements + /// the interface. AutoML will invoke the method + /// after each model it produces during the + /// course of the experiment. + /// + /// The cross validation experiment result. + /// + /// Depending on the size of your data, the AutoML experiment could take a long time to execute. + /// + public CrossValidationExperimentResult Execute(IDataView trainData, uint numberOfCVFolds, ColumnInformation columnInformation = null, IEstimator preFeaturizer = null, IProgress> progressHandler = null) + { + UserInputValidationUtil.ValidateNumberOfCVFoldsArg(numberOfCVFolds); + var splitResult = SplitUtil.CrossValSplit(Context, trainData, numberOfCVFolds, columnInformation?.SamplingKeyColumnName); + return ExecuteCrossVal(splitResult.trainDatasets, columnInformation, splitResult.validationDatasets, preFeaturizer, progressHandler); + } + + /// + /// Executes an AutoML experiment. + /// + /// The training data to be used by the AutoML experiment. + /// The number of cross validation folds into which the training data should be divided when fitting a model. + /// The name of the label column. + /// The name of the sampling key column. + /// Pre-featurizer that AutoML will apply to the data during an + /// experiment. (The pre-featurizer will be fit only on the training data split to produce a + /// trained transform. Then, the trained transform will be applied to both the training + /// data split and corresponding validation data split.) + /// A user-defined object that implements + /// the interface. AutoML will invoke the method + /// after each model it produces during the + /// course of the experiment. 
+ /// + /// The cross validation experiment result. + /// + /// Depending on the size of your data, the AutoML experiment could take a long time to execute. + /// + public CrossValidationExperimentResult Execute(IDataView trainData, + uint numberOfCVFolds, string labelColumnName = DefaultColumnNames.Label, + string samplingKeyColumn = null, IEstimator preFeaturizer = null, + Progress> progressHandler = null) + { + var columnInformation = new ColumnInformation() + { + LabelColumnName = labelColumnName, + SamplingKeyColumnName = samplingKeyColumn + }; + return Execute(trainData, numberOfCVFolds, columnInformation, preFeaturizer, progressHandler); + } + + private protected abstract CrossValidationRunDetail GetBestCrossValRun(IEnumerable> results); + + private protected abstract RunDetail GetBestRun(IEnumerable> results); + + private ExperimentResult ExecuteTrainValidate(IDataView trainData, + ColumnInformation columnInfo, + IDataView validationData, + IEstimator preFeaturizer, + IProgress> progressHandler) + { + columnInfo = columnInfo ?? 
new ColumnInformation(); + UserInputValidationUtil.ValidateExperimentExecuteArgs(trainData, columnInfo, validationData, _task); + + // Apply pre-featurizer + ITransformer preprocessorTransform = null; + if (preFeaturizer != null) + { + preprocessorTransform = preFeaturizer.Fit(trainData); + trainData = preprocessorTransform.Transform(trainData); + validationData = preprocessorTransform.Transform(validationData); + } + + var runner = new TrainValidateRunner(Context, trainData, validationData, columnInfo.LabelColumnName, MetricsAgent, + preFeaturizer, preprocessorTransform, _logger); + var columns = DatasetColumnInfoUtil.GetDatasetColumnInfo(Context, trainData, columnInfo); + return Execute(columnInfo, columns, preFeaturizer, progressHandler, runner); + } + + private CrossValidationExperimentResult ExecuteCrossVal(IDataView[] trainDatasets, + ColumnInformation columnInfo, + IDataView[] validationDatasets, + IEstimator preFeaturizer, + IProgress> progressHandler) + { + columnInfo = columnInfo ?? 
new ColumnInformation(); + UserInputValidationUtil.ValidateExperimentExecuteArgs(trainDatasets[0], columnInfo, validationDatasets[0], _task); + + // Apply pre-featurizer + ITransformer[] preprocessorTransforms = null; + (trainDatasets, validationDatasets, preprocessorTransforms) = ApplyPreFeaturizerCrossVal(trainDatasets, validationDatasets, preFeaturizer); + + var runner = new CrossValRunner(Context, trainDatasets, validationDatasets, MetricsAgent, preFeaturizer, + preprocessorTransforms, columnInfo.LabelColumnName, _logger); + var columns = DatasetColumnInfoUtil.GetDatasetColumnInfo(Context, trainDatasets[0], columnInfo); + + // Execute experiment & get all pipelines run + var experiment = new Experiment, TMetrics>(Context, _task, OptimizingMetricInfo, progressHandler, + Settings, MetricsAgent, _trainerWhitelist, columns, runner, _logger); + var runDetails = experiment.Execute(); + + var bestRun = GetBestCrossValRun(runDetails); + var experimentResult = new CrossValidationExperimentResult(runDetails, bestRun); + return experimentResult; + } + + private ExperimentResult ExecuteCrossValSummary(IDataView[] trainDatasets, + ColumnInformation columnInfo, + IDataView[] validationDatasets, + IEstimator preFeaturizer, + IProgress> progressHandler) + { + columnInfo = columnInfo ?? 
new ColumnInformation(); + UserInputValidationUtil.ValidateExperimentExecuteArgs(trainDatasets[0], columnInfo, validationDatasets[0], _task); + + // Apply pre-featurizer + ITransformer[] preprocessorTransforms = null; + (trainDatasets, validationDatasets, preprocessorTransforms) = ApplyPreFeaturizerCrossVal(trainDatasets, validationDatasets, preFeaturizer); + + var runner = new CrossValSummaryRunner(Context, trainDatasets, validationDatasets, MetricsAgent, preFeaturizer, + preprocessorTransforms, columnInfo.LabelColumnName, OptimizingMetricInfo, _logger); + var columns = DatasetColumnInfoUtil.GetDatasetColumnInfo(Context, trainDatasets[0], columnInfo); + return Execute(columnInfo, columns, preFeaturizer, progressHandler, runner); + } + + private ExperimentResult Execute(ColumnInformation columnInfo, + DatasetColumnInfo[] columns, + IEstimator preFeaturizer, + IProgress> progressHandler, + IRunner> runner) + { + // Execute experiment & get all pipelines run + var experiment = new Experiment, TMetrics>(Context, _task, OptimizingMetricInfo, progressHandler, + Settings, MetricsAgent, _trainerWhitelist, columns, runner, _logger); + var runDetails = experiment.Execute(); + + var bestRun = GetBestRun(runDetails); + var experimentResult = new ExperimentResult(runDetails, bestRun); + return experimentResult; + } + + private static (IDataView[] trainDatasets, IDataView[] validDatasets, ITransformer[] preprocessorTransforms) + ApplyPreFeaturizerCrossVal(IDataView[] trainDatasets, IDataView[] validDatasets, IEstimator preFeaturizer) + { + if (preFeaturizer == null) + { + return (trainDatasets, validDatasets, null); + } + + var preprocessorTransforms = new ITransformer[trainDatasets.Length]; + for (var i = 0; i < trainDatasets.Length; i++) + { + // Preprocess train and validation data + preprocessorTransforms[i] = preFeaturizer.Fit(trainDatasets[i]); + trainDatasets[i] = preprocessorTransforms[i].Transform(trainDatasets[i]); + validDatasets[i] = 
preprocessorTransforms[i].Transform(validDatasets[i]); + } + + return (trainDatasets, validDatasets, preprocessorTransforms); + } + } +} diff --git a/src/Microsoft.ML.AutoML/API/ExperimentResults/CrossValidationExperimentResult.cs b/src/Microsoft.ML.AutoML/API/ExperimentResults/CrossValidationExperimentResult.cs new file mode 100644 index 0000000000..5c1db48a59 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/ExperimentResults/CrossValidationExperimentResult.cs @@ -0,0 +1,40 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + /// + /// Result of an AutoML experiment that includes cross validation details. + /// + /// Metrics type for the experiment (like ). + public class CrossValidationExperimentResult + { + /// + /// Details of the cross validation runs in this experiment. + /// + /// + /// See for more information. + /// + public readonly IEnumerable> RunDetails; + + /// + /// Best run in this experiment. + /// + /// + /// AutoML considers the optimizing metric (like ) + /// when determining the best run. + /// + public readonly CrossValidationRunDetail BestRun; + + internal CrossValidationExperimentResult(IEnumerable> runDetails, + CrossValidationRunDetail bestRun) + { + RunDetails = runDetails; + BestRun = bestRun; + } + } +} diff --git a/src/Microsoft.ML.AutoML/API/ExperimentResults/ExperimentResult.cs b/src/Microsoft.ML.AutoML/API/ExperimentResults/ExperimentResult.cs new file mode 100644 index 0000000000..85eecfdb7f --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/ExperimentResults/ExperimentResult.cs @@ -0,0 +1,40 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information.
+ +using System.Collections.Generic; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + /// + /// Result of an AutoML experiment. + /// + /// Metrics type for the experiment (like ). + public class ExperimentResult + { + /// + /// Details of the runs in this experiment. + /// + /// + /// See for more information. + /// + public readonly IEnumerable> RunDetails; + + /// + /// Best run in this experiment. + /// + /// + /// AutoML considers the optimizing metric (like ) + /// when determining the best run. + /// + public readonly RunDetail BestRun; + + internal ExperimentResult(IEnumerable> runDetails, + RunDetail bestRun) + { + RunDetails = runDetails; + BestRun = bestRun; + } + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.AutoML/API/ExperimentSettings.cs b/src/Microsoft.ML.AutoML/API/ExperimentSettings.cs new file mode 100644 index 0000000000..21d08eca30 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/ExperimentSettings.cs @@ -0,0 +1,96 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.IO; +using System.Threading; + +namespace Microsoft.ML.AutoML +{ + /// + /// Base class for experiment settings. All task-specific AutoML experiment settings + /// (like ) inherit from this class. + /// + public abstract class ExperimentSettings + { + /// + /// Maximum time in seconds the experiment is allowed to run. + /// + /// The default value is 86,400, the number of seconds in one day. + /// + /// An experiment may run for longer than MaxExperimentTimeInSeconds. + /// This is because once AutoML starts training an ML.NET model, AutoML lets the + /// model train to completion. 
For instance, if the first model + /// AutoML trains takes 4 hours, and the second model trained takes 5 hours, + /// but MaxExperimentTimeInSeconds was the number of seconds in 6 hours, + /// the experiment will run for 4 + 5 = 9 hours (not 6 hours). 
+ /// + public uint MaxExperimentTimeInSeconds { get; set; } + + /// + /// Cancellation token for the AutoML experiment. It propagates the notification + /// that the experiment should be canceled. + /// + /// + /// An experiment may not immediately stop after cancellation. + /// This is because once AutoML starts training an ML.NET model, AutoML lets the + /// model train to completion. For instance, if the first model + /// AutoML trains takes 4 hours, and the second model trained takes 5 hours, + /// but cancellation is requested after 6 hours, + /// the experiment will stop after 4 + 5 = 9 hours (not 6 hours). + /// + public CancellationToken CancellationToken { get; set; } + + /// + /// This is a pointer to a directory where all models trained during the AutoML experiment will be saved. + /// If null, models will be kept in memory instead of written to disk. + /// (Please note: for an experiment with high runtime operating on a large dataset, opting to keep models in + /// memory could cause a system to run out of memory.) + /// + /// The default value is the directory named "Microsoft.ML.AutoML" in the current user's temporary folder. + public DirectoryInfo CacheDirectory { get; set; } + + /// + /// Whether AutoML should cache before ML.NET trainers. + /// See for more information on caching. + /// + /// The default value is CacheBeforeTrainer.Auto. + public CacheBeforeTrainer CacheBeforeTrainer { get; set; } + + internal int MaxModels; + + /// + /// Initializes a new instance of ExperimentSettings. + /// + public ExperimentSettings() + { + MaxExperimentTimeInSeconds = 24 * 60 * 60; + CancellationToken = default; + CacheDirectory = new DirectoryInfo(Path.Combine(Path.GetTempPath(), "Microsoft.ML.AutoML")); + CacheBeforeTrainer = CacheBeforeTrainer.Auto; + MaxModels = int.MaxValue; + } + } + + /// + /// Whether AutoML should cache before ML.NET trainers. + /// See for more information on caching. 
+ /// + public enum CacheBeforeTrainer + { + /// + /// Dynamically determine whether to cache before each trainer. + /// + Auto, + + /// + /// Always force caching on. + /// + On, + + /// + /// Always force caching off. + /// + Off, + } +} diff --git a/src/Microsoft.ML.AutoML/API/InferenceException.cs b/src/Microsoft.ML.AutoML/API/InferenceException.cs new file mode 100644 index 0000000000..667309d7f4 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/InferenceException.cs @@ -0,0 +1,50 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; + +namespace Microsoft.ML.AutoML +{ + /// + /// Type of exception encountered by AutoML. + /// + public enum InferenceExceptionType + { + /// + /// Exception that occurs when AutoML is inferring the data type of a column. + /// + ColumnDataType, + + /// + /// Exception that occurs when AutoML is attempting to split a dataset into distinct columns. + /// + ColumnSplit, + } + + /// + /// Exception thrown by AutoML. + /// + public sealed class InferenceException : Exception + { + /// + /// Type of AutoML exception that occurred. + /// + public InferenceExceptionType InferenceExceptionType; + + internal InferenceException(InferenceExceptionType inferenceType, string message) + : base(message) + { + // Store the category so callers can read it from the public field. + InferenceExceptionType = inferenceType; + } + + internal InferenceException(InferenceExceptionType inferenceType, string message, Exception inner) + : base(message, inner) + { + // Store the category so callers can read it from the public field. + InferenceExceptionType = inferenceType; + } + } + +} diff --git a/src/Microsoft.ML.AutoML/API/MLContextExtension.cs b/src/Microsoft.ML.AutoML/API/MLContextExtension.cs new file mode 100644 index 0000000000..da223838e1 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/MLContextExtension.cs @@ -0,0 +1,22 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +namespace Microsoft.ML.AutoML +{ + /// + /// Class containing AutoML extension methods to MLContext. + /// + public static class MLContextExtension + { + /// + /// Returns a catalog of all possible AutoML operations. + /// + /// MLContext instance. + /// A catalog of all possible AutoML operations. + public static AutoCatalog Auto(this MLContext mlContext) + { + return new AutoCatalog(mlContext); + } + } +} diff --git a/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs b/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs new file mode 100644 index 0000000000..6f45e6f54a --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/MulticlassClassificationExperiment.cs @@ -0,0 +1,162 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.FastTree; +using Microsoft.ML.Trainers.LightGbm; + +namespace Microsoft.ML.AutoML +{ + /// + /// Settings for AutoML experiments on multiclass classification datasets. + /// + public sealed class MulticlassExperimentSettings : ExperimentSettings + { + /// + /// Metric that AutoML will try to optimize over the course of the experiment. + /// + /// The default value is MulticlassClassificationMetric.MicroAccuracy. + public MulticlassClassificationMetric OptimizingMetric { get; set; } + + /// + /// Collection of trainers the AutoML experiment can leverage. + /// + /// + /// The default value is a collection auto-populated with all possible trainers (all values of MulticlassClassificationTrainer). + /// + public ICollection Trainers { get; } + + /// + /// Initializes a new instance of MulticlassExperimentSettings. 
+ /// + public MulticlassExperimentSettings() + { + OptimizingMetric = MulticlassClassificationMetric.MicroAccuracy; + Trainers = Enum.GetValues(typeof(MulticlassClassificationTrainer)).OfType().ToList(); + } + } + + /// + /// Multiclass classification metric that AutoML will aim to optimize in its sweeping process during an experiment. + /// + public enum MulticlassClassificationMetric + { + /// + /// See . + /// + MicroAccuracy, + + /// + /// See . + /// + MacroAccuracy, + + /// + /// See . + /// + LogLoss, + + /// + /// See . + /// + LogLossReduction, + + /// + /// See . + /// + TopKAccuracy, + } + + /// + /// Enumeration of ML.NET multiclass classification trainers used by AutoML. + /// + public enum MulticlassClassificationTrainer + { + /// + /// using . + /// + AveragedPerceptronOva, + + /// + /// using . + /// + FastForestOva, + + /// + /// using . + /// + FastTreeOva, + + /// + /// See . + /// + LightGbm, + + /// + /// using . + /// + LinearSupportVectorMachinesOva, + + /// + /// See . + /// + LbfgsMaximumEntropy, + + /// + /// using . + /// + LbfgsLogisticRegressionOva, + + /// + /// See . + /// + SdcaMaximumEntropy, + + /// + /// using . + /// + SgdCalibratedOva, + + /// + /// using . + /// + SymbolicSgdLogisticRegressionOva, + } + + /// + /// AutoML experiment on multiclass classification datasets. 
+ /// + /// + /// + /// + /// + public sealed class MulticlassClassificationExperiment : ExperimentBase + { + internal MulticlassClassificationExperiment(MLContext context, MulticlassExperimentSettings settings) + : base(context, + new MultiMetricsAgent(context, settings.OptimizingMetric), + new OptimizingMetricInfo(settings.OptimizingMetric), + settings, + TaskKind.MulticlassClassification, + TrainerExtensionUtil.GetTrainerNames(settings.Trainers)) + { + } + + private protected override CrossValidationRunDetail GetBestCrossValRun(IEnumerable> results) + { + return BestResultUtil.GetBestRun(results, MetricsAgent, OptimizingMetricInfo.IsMaximizing); + } + + private protected override RunDetail GetBestRun(IEnumerable> results) + { + return BestResultUtil.GetBestRun(results, MetricsAgent, OptimizingMetricInfo.IsMaximizing); + } + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.AutoML/API/Pipeline.cs b/src/Microsoft.ML.AutoML/API/Pipeline.cs new file mode 100644 index 0000000000..28674fd2b9 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/Pipeline.cs @@ -0,0 +1,110 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Collections.Generic; + +namespace Microsoft.ML.AutoML +{ + internal class Pipeline + { + public PipelineNode[] Nodes { get; set; } + public bool CacheBeforeTrainer { get; set; } + + public Pipeline(PipelineNode[] nodes, bool cacheBeforeTrainer = false) + { + Nodes = nodes; + CacheBeforeTrainer = cacheBeforeTrainer; + } + + // (used by Newtonsoft) + internal Pipeline() + { + } + + public IEstimator ToEstimator(MLContext context) + { + var inferredPipeline = SuggestedPipeline.FromPipeline(context, this); + return inferredPipeline.ToEstimator(); + } + } + + internal class PipelineNode + { + public string Name { get; set; } + public PipelineNodeType NodeType { get; set; } + public string[] InColumns { get; set; } + public string[] OutColumns { get; set; } + public IDictionary Properties { get; set; } + + public PipelineNode(string name, PipelineNodeType nodeType, + string[] inColumns, string[] outColumns, + IDictionary properties = null) + { + Name = name; + NodeType = nodeType; + InColumns = inColumns; + OutColumns = outColumns; + Properties = properties ?? 
new Dictionary(); + } + + public PipelineNode(string name, PipelineNodeType nodeType, + string inColumn, string outColumn, IDictionary properties = null) : + this(name, nodeType, new string[] { inColumn }, new string[] { outColumn }, properties) + { + } + + public PipelineNode(string name, PipelineNodeType nodeType, + string[] inColumns, string outColumn, IDictionary properties = null) : + this(name, nodeType, inColumns, new string[] { outColumn }, properties) + { + } + + // (used by Newtonsoft) + internal PipelineNode() + { + } + } + + internal enum PipelineNodeType + { + Transform, + Trainer + } + + internal class CustomProperty + { + public string Name { get; set; } + public IDictionary Properties { get; set; } + + public CustomProperty(string name, IDictionary properties) + { + Name = name; + Properties = properties; + } + + internal CustomProperty() + { + } + } + + internal class PipelineScore + { + public readonly double Score; + + /// + /// This setting is true if the pipeline run succeeded and ran to completion. + /// Else, it is false if some exception was thrown before the run could complete. + /// + public readonly bool RunSucceded; + + internal readonly Pipeline Pipeline; + + internal PipelineScore(Pipeline pipeline, double score, bool runSucceeded) + { + Pipeline = pipeline; + Score = score; + RunSucceded = runSucceeded; + } + } +} diff --git a/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs b/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs new file mode 100644 index 0000000000..438260a1eb --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/RegressionExperiment.cs @@ -0,0 +1,176 @@ +// Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.FastTree; +using Microsoft.ML.Trainers.LightGbm; + +namespace Microsoft.ML.AutoML +{ + /// + /// Settings for AutoML experiments on regression datasets. + /// + public sealed class RegressionExperimentSettings : ExperimentSettings + { + /// + /// Metric that AutoML will try to optimize over the course of the experiment. + /// + /// The default value is RegressionMetric.RSquared. + public RegressionMetric OptimizingMetric { get; set; } + + /// + /// Collection of trainers the AutoML experiment can leverage. + /// + /// + /// The default value is a collection auto-populated with all possible trainers (all values of RegressionTrainer). + /// + public ICollection Trainers { get; } + + public RegressionExperimentSettings() + { + OptimizingMetric = RegressionMetric.RSquared; + Trainers = Enum.GetValues(typeof(RegressionTrainer)).OfType().ToList(); + } + } + + /// + /// Regression metric that AutoML will aim to optimize in its sweeping process during an experiment. + /// + public enum RegressionMetric + { + /// + /// See . + /// + MeanAbsoluteError, + + /// + /// See . + /// + MeanSquaredError, + + /// + /// See . + /// + RootMeanSquaredError, + + /// + /// See . + /// + RSquared + } + + /// + /// Enumeration of ML.NET regression trainers used by AutoML. + /// + public enum RegressionTrainer + { + /// + /// See . + /// + FastForest, + + /// + /// See . + /// + FastTree, + + /// + /// See . + /// + FastTreeTweedie, + + /// + /// See . + /// + LightGbm, + + /// + /// See . + /// + OnlineGradientDescent, + + /// + /// See . + /// + Ols, + + /// + /// See . + /// + LbfgsPoissonRegression, + + /// + /// See . + /// + StochasticDualCoordinateAscent, + } + + /// + /// AutoML experiment on regression datasets. 
+ /// + /// + /// + /// + /// + public sealed class RegressionExperiment : ExperimentBase + { + internal RegressionExperiment(MLContext context, RegressionExperimentSettings settings) + : base(context, + new RegressionMetricsAgent(context, settings.OptimizingMetric), + new OptimizingMetricInfo(settings.OptimizingMetric), + settings, + TaskKind.Regression, + TrainerExtensionUtil.GetTrainerNames(settings.Trainers)) + { + } + + private protected override CrossValidationRunDetail GetBestCrossValRun(IEnumerable> results) + { + return BestResultUtil.GetBestRun(results, MetricsAgent, OptimizingMetricInfo.IsMaximizing); + } + + private protected override RunDetail GetBestRun(IEnumerable> results) + { + return BestResultUtil.GetBestRun(results, MetricsAgent, OptimizingMetricInfo.IsMaximizing); + } + } + + /// + /// Extension methods that operate over regression experiment run results. + /// + public static class RegressionExperimentResultExtensions + { + /// + /// Select the best run from an enumeration of experiment runs. + /// + /// Enumeration of AutoML experiment run results. + /// Metric to consider when selecting the best run. + /// The best experiment run. + public static RunDetail Best(this IEnumerable> results, RegressionMetric metric = RegressionMetric.RSquared) + { + var metricsAgent = new RegressionMetricsAgent(null, metric); + var isMetricMaximizing = new OptimizingMetricInfo(metric).IsMaximizing; + return BestResultUtil.GetBestRun(results, metricsAgent, isMetricMaximizing); + } + + /// + /// Select the best run from an enumeration of experiment cross validation runs. + /// + /// Enumeration of AutoML experiment cross validation run results. + /// Metric to consider when selecting the best run. + /// The best experiment run. 
+ public static CrossValidationRunDetail Best(this IEnumerable> results, RegressionMetric metric = RegressionMetric.RSquared) + { + var metricsAgent = new RegressionMetricsAgent(null, metric); + var isMetricMaximizing = new OptimizingMetricInfo(metric).IsMaximizing; + return BestResultUtil.GetBestRun(results, metricsAgent, isMetricMaximizing); + } + } +} diff --git a/src/Microsoft.ML.AutoML/API/RunDetails/CrossValidationRunDetail.cs b/src/Microsoft.ML.AutoML/API/RunDetails/CrossValidationRunDetail.cs new file mode 100644 index 0000000000..b83de334c9 --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/RunDetails/CrossValidationRunDetail.cs @@ -0,0 +1,78 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; + +namespace Microsoft.ML.AutoML +{ + /// + /// Details about a cross validation run in an AutoML experiment. + /// + /// Metrics type for the run. + /// + /// Over the course of an experiment, many models are evaluated on a dataset + /// using cross validation. This object contains information about each model + /// evaluated during the AutoML experiment. + /// + public sealed class CrossValidationRunDetail : RunDetail + { + /// + /// Results for each of the cross validation folds. + /// + public IEnumerable> Results { get; private set; } + + internal CrossValidationRunDetail(string trainerName, + IEstimator estimator, + Pipeline pipeline, + IEnumerable> results) : base(trainerName, estimator, pipeline) + { + Results = results; + } + } + + /// + /// Result of a pipeline trained on a cross validation fold. + /// + /// Metrics type for the run. + public sealed class TrainResult + { + /// + /// Each fold has training data and validation data. 
A model trained on the + /// fold's training data is evaluated against the validation data, + /// and the metrics for that calculation are emitted here. + /// + public TMetrics ValidationMetrics { get; private set; } + + /// + /// Model trained on the fold during the run. + /// + /// + /// You can use the trained model to obtain predictions on input data. This is null when no model container was stored for the fold. + /// + public ITransformer Model { get { return _modelContainer?.GetModel(); } } + + /// + /// Exception encountered while training the fold. This property is null + /// if no exception was encountered. + /// + /// + /// If an exception occurred, it's possible some properties in this object + /// (like Model) could be null. + /// + public Exception Exception { get; private set; } + + private readonly ModelContainer _modelContainer; + + internal TrainResult(ModelContainer modelContainer, + TMetrics metrics, + Exception exception) + { + _modelContainer = modelContainer; + ValidationMetrics = metrics; + Exception = exception; + } + } + +} diff --git a/src/Microsoft.ML.AutoML/API/RunDetails/RunDetail.cs b/src/Microsoft.ML.AutoML/API/RunDetails/RunDetail.cs new file mode 100644 index 0000000000..bce89ecf1f --- /dev/null +++ b/src/Microsoft.ML.AutoML/API/RunDetails/RunDetail.cs @@ -0,0 +1,109 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + /// + /// Details about an AutoML experiment run. + /// + /// + /// Over the course of an experiment, many models are evaluated on a dataset. + /// This object contains information about each model evaluated during + /// the AutoML experiment. + /// + /// Metrics type for the experiment (like ). + public sealed class RunDetail : RunDetail + { + /// + /// Metrics of how the trained model performed on the validation data during + /// the run. 
+ /// + /// + /// Internally, each run has train data and validation data. Model trained on the + /// run's training data is evaluated against the validation data, + /// and the metrics for that calculation are emitted here. + /// + public TMetrics ValidationMetrics { get; private set; } + + /// + /// Model trained during the run. + /// + /// + /// You can use the trained model to obtain predictions on input data. + /// + public ITransformer Model { get { return _modelContainer?.GetModel(); } } + + /// + /// Exception encountered during the run. This property is null if + /// no exception was encountered. + /// + /// + /// If an exception occurred, it's possible some properties in this object + /// (like Model) could be null. + /// + public Exception Exception { get; private set; } + + private readonly ModelContainer _modelContainer; + + internal RunDetail(string trainerName, + IEstimator estimator, + Pipeline pipeline, + ModelContainer modelContainer, + TMetrics metrics, + Exception exception) : base(trainerName, estimator, pipeline) + { + _modelContainer = modelContainer; + ValidationMetrics = metrics; + Exception = exception; + } + } + + /// + /// Details about an AutoML experiment run. + /// + /// + /// In trying to produce the best model, an AutoML experiment evaluates the quality of many models + /// on a dataset. This object contains information about each model tried during the AutoML experiment. + /// + public abstract class RunDetail + { + /// + /// String name of the trainer used in this run. (For instance, "LightGbm".) + /// + public string TrainerName { get; private set; } + + /// + /// Runtime in seconds. + /// + /// + /// Runtime includes model training time. Depending on the size of the data, + /// the runtime may be quite long. + /// + public double RuntimeInSeconds { get; internal set; } + + /// + /// An ML.NET IEstimator that represents the pipeline in this run. + /// + /// + /// You can call Fit on + /// this estimator to re-train your pipeline on any IDataView. 
+ /// + public IEstimator Estimator { get; private set; } + + internal Pipeline Pipeline { get; private set; } + internal double PipelineInferenceTimeInSeconds { get; set; } + + internal RunDetail(string trainerName, + IEstimator estimator, + Pipeline pipeline) + { + TrainerName = trainerName; + Estimator = estimator; + Pipeline = pipeline; + } + } +} diff --git a/src/Microsoft.ML.AutoML/Assembly.cs b/src/Microsoft.ML.AutoML/Assembly.cs new file mode 100644 index 0000000000..7a309d29fe --- /dev/null +++ b/src/Microsoft.ML.AutoML/Assembly.cs @@ -0,0 +1,13 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Runtime.CompilerServices; +using Microsoft.ML; + +[assembly: InternalsVisibleTo("Microsoft.ML.AutoML.Tests, PublicKey=002400000480000094000000060200000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a34928e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")] +[assembly: InternalsVisibleTo("mlnet, PublicKey=00240000048000009400000006020000002400005253413100040000010001004b86c4cb78549b34bab61a3b1800e23bfeb5b3ec390074041536a7e3cbd97f5f04cf0f857155a8928eaa29ebfd11cfbbad3ba70efea7bda3226c6a8d370a4cd303f714486b6ebc225985a638471e6ef571cc92a4613c00b8fa65d61ccee0cbe5f36330c9a01f4183559f1bef24cc2917c6d913e3a541333a1d05d9bed22b38cb")] +[assembly: InternalsVisibleTo("mlnet.Tests, PublicKey=002400000480000094000000060200000024000052534131000400000100010015c01ae1f50e8cc09ba9eac9147cf8fd9fce2cfe9f8dce4f7301c4132ca9fb50ce8cbf1df4dc18dd4d210e4345c744ecb3365ed327efdbc52603faa5e21daa11234c8c4a73e51f03bf192544581ebe107adee3a34928e39d04e524a9ce729d5090bfd7dad9d10c722c0def9ccc08ff0a03790e48bcd1f9b6c476063e1966a1c4")] +[assembly: 
InternalsVisibleTo("Benchmark, PublicKey=00240000048000009400000006020000002400005253413100040000010001004b86c4cb78549b34bab61a3b1800e23bfeb5b3ec390074041536a7e3cbd97f5f04cf0f857155a8928eaa29ebfd11cfbbad3ba70efea7bda3226c6a8d370a4cd303f714486b6ebc225985a638471e6ef571cc92a4613c00b8fa65d61ccee0cbe5f36330c9a01f4183559f1bef24cc2917c6d913e3a541333a1d05d9bed22b38cb")] + +[assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.AutoML/AutoMlUtils.cs b/src/Microsoft.ML.AutoML/AutoMlUtils.cs new file mode 100644 index 0000000000..d2f4b30c62 --- /dev/null +++ b/src/Microsoft.ML.AutoML/AutoMlUtils.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Threading; + +namespace Microsoft.ML.AutoML +{ + internal static class AutoMlUtils + { + public static readonly ThreadLocal Random = new ThreadLocal(() => new Random()); + } +} diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnGroupingInference.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnGroupingInference.cs new file mode 100644 index 0000000000..2abec5c000 --- /dev/null +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnGroupingInference.cs @@ -0,0 +1,151 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using Microsoft.ML.Data; +using static Microsoft.ML.Data.TextLoader; + +namespace Microsoft.ML.AutoML +{ + /// + /// This class incapsulates logic for grouping together the inferred columns of the text file based on their type + /// and purpose, and generating column names. 
+ /// + internal static class ColumnGroupingInference + { + /// + /// This is effectively a merger of and a + /// with support for vector-value columns. + /// + public class GroupingColumn + { + public string SuggestedName; + public DataKind ItemKind; + public ColumnPurpose Purpose; + public Range[] Ranges; + + public GroupingColumn(string name, DataKind kind, ColumnPurpose purpose, Range[] ranges) + { + SuggestedName = name; + ItemKind = kind; + Purpose = purpose; + Ranges = ranges; + } + + public TextLoader.Column GenerateTextLoaderColumn() + { + return new TextLoader.Column(SuggestedName, ItemKind, Ranges); + } + } + + /// + /// Group together the single-valued columns with the same type and purpose and generate column names. + /// + /// The host environment to use. + /// Whether the original file had a header. + /// If yes, the fields are used to generate the column + /// names, otherwise they are ignored. + /// The (detected) column types. + /// The (detected) column purposes. Must be parallel to . + /// The struct containing an array of grouped columns specifications. + public static GroupingColumn[] InferGroupingAndNames(MLContext env, bool hasHeader, ColumnTypeInference.Column[] types, PurposeInference.Column[] purposes) + { + var result = new List(); + var tuples = types.Zip(purposes, Tuple.Create).ToList(); + var grouped = + from t in tuples + group t by + new + { + t.Item1.ItemType, + t.Item2.Purpose, + purposeGroupId = GetPurposeGroupId(t.Item1.ColumnIndex, t.Item2.Purpose) + } + into g + select g; + + foreach (var g in grouped) + { + string name = (hasHeader && g.Count() == 1) + ? 
g.First().Item1.SuggestedName + : GetName(g.Key.ItemType.GetRawKind().ToDataKind(), g.Key.Purpose, result); + + var ranges = GetRanges(g.Select(t => t.Item1.ColumnIndex).ToArray()); + result.Add(new GroupingColumn(name, g.Key.ItemType.GetRawKind().ToDataKind(), g.Key.Purpose, ranges)); + } + + return result.ToArray(); + } + + private static int GetPurposeGroupId(int columnIndex, ColumnPurpose purpose) + { + if (purpose == ColumnPurpose.CategoricalFeature || + purpose == ColumnPurpose.TextFeature || + purpose == ColumnPurpose.Ignore) + return columnIndex; + return 0; + } + + private static string GetName(DataKind itemKind, ColumnPurpose purpose, List previousColumns) + { + string prefix = GetPurposeName(purpose, itemKind); + int i = 0; + string name = prefix; + while (previousColumns.Any(x => x.SuggestedName == name)) + { + i++; + name = string.Format("{0}{1:00}", prefix, i); + } + + return name; + } + + private static string GetPurposeName(ColumnPurpose purpose, DataKind itemKind) + { + switch (purpose) + { + case ColumnPurpose.NumericFeature: + if (itemKind == DataKind.Boolean) + { + return "BooleanFeatures"; + } + else + { + return "Features"; + } + case ColumnPurpose.CategoricalFeature: + return "Cat"; + default: + return Enum.GetName(typeof(ColumnPurpose), purpose); + } + } + + /// + /// Generates a collection of Ranges from indices. 
+ /// + private static Range[] GetRanges(int[] indices) + { + Array.Sort(indices); + var allRanges = new List(); + var currRange = new Range(indices[0]); + for (int i = 1; i < indices.Length; i++) + { + if (indices[i] == currRange.Max + 1) + { + currRange.Max++; + } + else + { + allRanges.Add(currRange); + currRange = new Range(indices[i]); + } + } + allRanges.Add(currRange); + return allRanges.ToArray(); + } + } +} diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs new file mode 100644 index 0000000000..526570f796 --- /dev/null +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs @@ -0,0 +1,152 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + internal static class ColumnInferenceApi + { + public static ColumnInferenceResults InferColumns(MLContext context, string path, uint labelColumnIndex, + bool hasHeader, char? separatorChar, bool? allowQuotedStrings, bool? 
supportSparse, bool trimWhitespace, bool groupColumns) + { + var sample = TextFileSample.CreateFromFullFile(path); + var splitInference = InferSplit(context, sample, separatorChar, allowQuotedStrings, supportSparse); + var typeInference = InferColumnTypes(context, sample, splitInference, hasHeader, labelColumnIndex, null); + + // If no headers, suggest label column name as 'Label' + if (!hasHeader) + { + typeInference.Columns[labelColumnIndex].SuggestedName = DefaultColumnNames.Label; + } + + var columnInfo = new ColumnInformation() { LabelColumnName = typeInference.Columns[labelColumnIndex].SuggestedName }; + + return InferColumns(context, path, columnInfo, hasHeader, splitInference, typeInference, trimWhitespace, groupColumns); + } + + public static ColumnInferenceResults InferColumns(MLContext context, string path, string labelColumn, + char? separatorChar, bool? allowQuotedStrings, bool? supportSparse, bool trimWhitespace, bool groupColumns) + { + var columnInfo = new ColumnInformation() { LabelColumnName = labelColumn }; + return InferColumns(context, path, columnInfo, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace, groupColumns); + } + + public static ColumnInferenceResults InferColumns(MLContext context, string path, ColumnInformation columnInfo, + char? separatorChar, bool? allowQuotedStrings, bool? 
supportSparse, bool trimWhitespace, bool groupColumns) + { + var sample = TextFileSample.CreateFromFullFile(path); + var splitInference = InferSplit(context, sample, separatorChar, allowQuotedStrings, supportSparse); + var typeInference = InferColumnTypes(context, sample, splitInference, true, null, columnInfo.LabelColumnName); + return InferColumns(context, path, columnInfo, true, splitInference, typeInference, trimWhitespace, groupColumns); + } + + public static ColumnInferenceResults InferColumns(MLContext context, string path, ColumnInformation columnInfo, bool hasHeader, + TextFileContents.ColumnSplitResult splitInference, ColumnTypeInference.InferenceResult typeInference, + bool trimWhitespace, bool groupColumns) + { + var loaderColumns = ColumnTypeInference.GenerateLoaderColumns(typeInference.Columns); + var typedLoaderOptions = new TextLoader.Options + { + Columns = loaderColumns, + Separators = new[] { splitInference.Separator.Value }, + AllowSparse = splitInference.AllowSparse, + AllowQuoting = splitInference.AllowQuote, + HasHeader = hasHeader, + TrimWhitespace = trimWhitespace + }; + var textLoader = context.Data.CreateTextLoader(typedLoaderOptions); + var dataView = textLoader.Load(path); + + // Validate all columns specified in column info exist in inferred data view + ColumnInferenceValidationUtil.ValidateSpecifiedColumnsExist(columnInfo, dataView); + + var purposeInferenceResult = PurposeInference.InferPurposes(context, dataView, columnInfo); + + // start building result objects + IEnumerable columnResults = null; + IEnumerable<(string, ColumnPurpose)> purposeResults = null; + + // infer column grouping and generate column names + if (groupColumns) + { + var groupingResult = ColumnGroupingInference.InferGroupingAndNames(context, hasHeader, + typeInference.Columns, purposeInferenceResult); + + columnResults = groupingResult.Select(c => c.GenerateTextLoaderColumn()); + purposeResults = groupingResult.Select(c => (c.SuggestedName, c.Purpose)); + } + 
else + { + columnResults = loaderColumns; + purposeResults = purposeInferenceResult.Select(p => (dataView.Schema[p.ColumnIndex].Name, p.Purpose)); + } + + var textLoaderOptions = new TextLoader.Options() + { + Columns = columnResults.ToArray(), + AllowQuoting = splitInference.AllowQuote, + AllowSparse = splitInference.AllowSparse, + Separators = new char[] { splitInference.Separator.Value }, + HasHeader = hasHeader, + TrimWhitespace = trimWhitespace + }; + + return new ColumnInferenceResults() + { + TextLoaderOptions = textLoaderOptions, + ColumnInformation = ColumnInformationUtil.BuildColumnInfo(purposeResults) + }; + } + + private static TextFileContents.ColumnSplitResult InferSplit(MLContext context, TextFileSample sample, char? separatorChar, bool? allowQuotedStrings, bool? supportSparse) + { + var separatorCandidates = separatorChar == null ? TextFileContents.DefaultSeparators : new char[] { separatorChar.Value }; + var splitInference = TextFileContents.TrySplitColumns(context, sample, separatorCandidates); + + // respect passed-in overrides + if (allowQuotedStrings != null) + { + splitInference.AllowQuote = allowQuotedStrings.Value; + } + if (supportSparse != null) + { + splitInference.AllowSparse = supportSparse.Value; + } + + if (!splitInference.IsSuccess) + { + throw new InferenceException(InferenceExceptionType.ColumnSplit, "Unable to split the file provided into multiple, consistent columns."); + } + + return splitInference; + } + + private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext context, TextFileSample sample, + TextFileContents.ColumnSplitResult splitInference, bool hasHeader, uint? 
labelColumnIndex, string label) + { + // infer column types + var typeInferenceResult = ColumnTypeInference.InferTextFileColumnTypes(context, sample, + new ColumnTypeInference.Arguments + { + ColumnCount = splitInference.ColumnCount, + Separator = splitInference.Separator.Value, + AllowSparse = splitInference.AllowSparse, + AllowQuote = splitInference.AllowQuote, + HasHeader = hasHeader, + LabelColumnIndex = labelColumnIndex, + Label = label + }); + + if (!typeInferenceResult.IsSuccess) + { + throw new InferenceException(InferenceExceptionType.ColumnDataType, "Unable to infer column types of the file provided."); + } + + return typeInferenceResult; + } + } +} diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceValidationUtil.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceValidationUtil.cs new file mode 100644 index 0000000000..cff2cad92d --- /dev/null +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceValidationUtil.cs @@ -0,0 +1,28 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; + +namespace Microsoft.ML.AutoML +{ + internal static class ColumnInferenceValidationUtil + { + /// + /// Validate all columns specified in column info exist in inferred data view. 
+ /// + public static void ValidateSpecifiedColumnsExist(ColumnInformation columnInfo, + IDataView dataView) + { + var columnNames = ColumnInformationUtil.GetColumnNames(columnInfo); + foreach (var columnName in columnNames) + { + if (dataView.Schema.GetColumnOrNull(columnName) == null) + { + throw new ArgumentException($"Specified column {columnName} " + + $"is not found in the dataset."); + } + } + } + } +} diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs new file mode 100644 index 0000000000..1228fc2183 --- /dev/null +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInformationUtil.cs @@ -0,0 +1,144 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + internal static class ColumnInformationUtil + { + internal static ColumnPurpose? 
GetColumnPurpose(this ColumnInformation columnInfo, string columnName) + { + if (columnName == columnInfo.LabelColumnName) + { + return ColumnPurpose.Label; + } + + if (columnName == columnInfo.ExampleWeightColumnName) + { + return ColumnPurpose.Weight; + } + + if (columnName == columnInfo.SamplingKeyColumnName) + { + return ColumnPurpose.SamplingKey; + } + + if (columnInfo.CategoricalColumnNames.Contains(columnName)) + { + return ColumnPurpose.CategoricalFeature; + } + + if (columnInfo.NumericColumnNames.Contains(columnName)) + { + return ColumnPurpose.NumericFeature; + } + + if (columnInfo.TextColumnNames.Contains(columnName)) + { + return ColumnPurpose.TextFeature; + } + + if (columnInfo.IgnoredColumnNames.Contains(columnName)) + { + return ColumnPurpose.Ignore; + } + + return null; + } + + internal static ColumnInformation BuildColumnInfo(IEnumerable<(string name, ColumnPurpose purpose)> columnPurposes) + { + var columnInfo = new ColumnInformation(); + + foreach (var column in columnPurposes) + { + switch (column.purpose) + { + case ColumnPurpose.Label: + columnInfo.LabelColumnName = column.name; + break; + case ColumnPurpose.Weight: + columnInfo.ExampleWeightColumnName = column.name; + break; + case ColumnPurpose.SamplingKey: + columnInfo.SamplingKeyColumnName = column.name; + break; + case ColumnPurpose.CategoricalFeature: + columnInfo.CategoricalColumnNames.Add(column.name); + break; + case ColumnPurpose.Ignore: + columnInfo.IgnoredColumnNames.Add(column.name); + break; + case ColumnPurpose.NumericFeature: + columnInfo.NumericColumnNames.Add(column.name); + break; + case ColumnPurpose.TextFeature: + columnInfo.TextColumnNames.Add(column.name); + break; + } + } + + return columnInfo; + } + + public static ColumnInformation BuildColumnInfo(IEnumerable columns) + { + return BuildColumnInfo(columns.Select(c => (c.Name, c.Purpose))); + } + + /// + /// Get all column names that are in the given column information. + /// + /// Column information. 
+ public static IEnumerable GetColumnNames(ColumnInformation columnInformation) + { + var columnNames = new List(); + AddStringToListIfNotNull(columnNames, columnInformation.LabelColumnName); + AddStringToListIfNotNull(columnNames, columnInformation.ExampleWeightColumnName); + AddStringToListIfNotNull(columnNames, columnInformation.SamplingKeyColumnName); + AddStringsToListIfNotNull(columnNames, columnInformation.CategoricalColumnNames); + AddStringsToListIfNotNull(columnNames, columnInformation.IgnoredColumnNames); + AddStringsToListIfNotNull(columnNames, columnInformation.NumericColumnNames); + AddStringsToListIfNotNull(columnNames, columnInformation.TextColumnNames); + return columnNames; + } + + public static IDictionary CountColumnsByPurpose(ColumnInformation columnInformation) + { + var result = new Dictionary(); + var columnNames = GetColumnNames(columnInformation); + foreach (var columnName in columnNames) + { + var purpose = columnInformation.GetColumnPurpose(columnName); + if (purpose == null) + { + continue; + } + + result.TryGetValue(purpose.Value, out int count); + result[purpose.Value] = ++count; + } + return result; + } + + private static void AddStringsToListIfNotNull(List list, IEnumerable strings) + { + foreach (var str in strings) + { + AddStringToListIfNotNull(list, str); + } + } + + private static void AddStringToListIfNotNull(List list, string str) + { + if (str != null) + { + list.Add(str); + } + } + } +} diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnPurpose.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnPurpose.cs new file mode 100644 index 0000000000..25339aea99 --- /dev/null +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnPurpose.cs @@ -0,0 +1,18 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +namespace Microsoft.ML.AutoML +{ + internal enum ColumnPurpose + { + Ignore = 0, + Label = 1, + NumericFeature = 2, + CategoricalFeature = 3, + TextFeature = 4, + Weight = 5, + ImagePath = 6, + SamplingKey = 7 + } +} diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs new file mode 100644 index 0000000000..1276b00e61 --- /dev/null +++ b/src/Microsoft.ML.AutoML/ColumnInference/ColumnTypeInference.cs @@ -0,0 +1,414 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; +using Microsoft.ML.Data; +using Microsoft.ML.Data.Conversion; + +namespace Microsoft.ML.AutoML +{ + /// + /// This class encapsulates logic for automatic inference of column types for the text file. + /// It also attempts to guess whether there is a header row. + /// + internal static class ColumnTypeInference + { + // Maximum number of columns to invoke type inference. + // REVIEW: revisit this requirement. Either work for arbitrary number of columns, + // or have a 'dumb' inference that would quickly figure everything out. + private const int SmartColumnsLim = 10000; + + internal sealed class Arguments + { + public char Separator; + public bool AllowSparse; + public bool AllowQuote; + public int ColumnCount; + public bool HasHeader; + public int MaxRowsToRead; + public uint? LabelColumnIndex; + public string Label; + + public Arguments() + { + MaxRowsToRead = 10000; + } + } + + private class IntermediateColumn + { + private readonly ReadOnlyMemory[] _data; + private readonly int _columnId; + private PrimitiveDataViewType _suggestedType; + private bool? 
_hasHeader; + + public int ColumnId + { + get { return _columnId; } + } + + public PrimitiveDataViewType SuggestedType + { + get { return _suggestedType; } + set { _suggestedType = value; } + } + + public bool? HasHeader + { + get { return _hasHeader; } + set { _hasHeader = value; } + } + + public IntermediateColumn(ReadOnlyMemory[] data, int columnId) + { + _data = data; + _columnId = columnId; + } + + public ReadOnlyMemory[] RawData { get { return _data; } } + + public string Name { get; set; } + + public bool HasAllBooleanValues() + { + if (RawData.Skip(1) + .All(x => + { + bool value; + // (note: Conversions.Instance.TryParse parses an empty string as a Boolean) + return !string.IsNullOrEmpty(x.ToString()) && + Conversions.Instance.TryParse(in x, out value); + })) + { + return true; + } + + return false; + } + } + + public class Column + { + public readonly int ColumnIndex; + + public PrimitiveDataViewType ItemType; + public string SuggestedName; + + public Column(int columnIndex, string suggestedName, PrimitiveDataViewType itemType) + { + ColumnIndex = columnIndex; + SuggestedName = suggestedName; + ItemType = itemType; + } + } + + public readonly struct InferenceResult + { + public readonly Column[] Columns; + public readonly bool HasHeader; + public readonly bool IsSuccess; + public readonly ReadOnlyMemory[][] Data; + + private InferenceResult(bool isSuccess, Column[] columns, bool hasHeader, ReadOnlyMemory[][] data) + { + IsSuccess = isSuccess; + Columns = columns; + HasHeader = hasHeader; + Data = data; + } + + public static InferenceResult Success(Column[] columns, bool hasHeader, ReadOnlyMemory[][] data) + { + return new InferenceResult(true, columns, hasHeader, data); + } + + public static InferenceResult Fail() + { + return new InferenceResult(false, null, false, null); + } + } + + private interface ITypeInferenceExpert + { + void Apply(IntermediateColumn[] columns); + } + + /// + /// Current design is as follows: there's a sequence of 'experts' that 
each look at all the columns. + /// Every expert may or may not assign the 'answer' (suggested type) to a column. If the expert needs + /// some information about the column (for example, the column values), this information is lazily calculated + /// by the column object, not the expert itself, to allow the reuse of the same information by another + /// expert. + /// + private static class Experts + { + internal sealed class BooleanValues : ITypeInferenceExpert + { + public void Apply(IntermediateColumn[] columns) + { + foreach (var col in columns) + { + // skip columns that already have a suggested type, + // or that don't have all Boolean values + if (col.SuggestedType != null || + !col.HasAllBooleanValues()) + { + continue; + } + + col.SuggestedType = BooleanDataViewType.Instance; + bool first; + + col.HasHeader = !Conversions.Instance.TryParse(in col.RawData[0], out first); + } + } + } + + internal sealed class AllNumericValues : ITypeInferenceExpert + { + public void Apply(IntermediateColumn[] columns) + { + foreach (var col in columns) + { + if (!col.RawData.Skip(1) + .All(x => + { + float value; + return Conversions.Instance.TryParse(in x, out value); + }) + ) + { + continue; + } + + col.SuggestedType = NumberDataViewType.Single; + + var headerStr = col.RawData[0].ToString(); + col.HasHeader = !double.TryParse(headerStr, out var doubleVal); + } + } + } + + internal sealed class EverythingText : ITypeInferenceExpert + { + public void Apply(IntermediateColumn[] columns) + { + foreach (var col in columns) + { + if (col.SuggestedType != null) + continue; + + col.SuggestedType = TextDataViewType.Instance; + col.HasHeader = IsLookLikeHeader(col.RawData[0]); + } + } + + private bool? 
IsLookLikeHeader(ReadOnlyMemory value) + { + var v = value.ToString(); + if (v.Length > 100) + return false; + var headerCandidates = new[] { "^Label", "^Feature", "^Market", "^m_", "^Weight" }; + foreach (var candidate in headerCandidates) + { + if (Regex.IsMatch(v, candidate, RegexOptions.IgnoreCase)) + return true; + } + + return null; + } + } + } + + private static IEnumerable GetExperts() + { + // Current logic is pretty primitive: if every value (except the first) of a column + // parses as numeric then it's numeric. Else if it parses as a Boolean, it's Boolean. Otherwise, it is text. + yield return new Experts.AllNumericValues(); + yield return new Experts.BooleanValues(); + yield return new Experts.EverythingText(); + } + + /// + /// Auto-detect column types of the file. + /// + public static InferenceResult InferTextFileColumnTypes(MLContext context, IMultiStreamSource fileSource, Arguments args) + { + return InferTextFileColumnTypesCore(context, fileSource, args); + } + + private static InferenceResult InferTextFileColumnTypesCore(MLContext context, IMultiStreamSource fileSource, Arguments args) + { + if (args.ColumnCount == 0) + { + // too many empty columns for automatic inference + return InferenceResult.Fail(); + } + + if (args.ColumnCount >= SmartColumnsLim) + { + // too many columns for automatic inference + return InferenceResult.Fail(); + } + + // read the file as the specified number of text columns + var textLoaderOptions = new TextLoader.Options + { + Columns = new[] { new TextLoader.Column("C", DataKind.String, 0, args.ColumnCount - 1) }, + Separators = new[] { args.Separator }, + AllowSparse = args.AllowSparse, + AllowQuoting = args.AllowQuote, + }; + var textLoader = context.Data.CreateTextLoader(textLoaderOptions); + var idv = textLoader.Load(fileSource); + idv = context.Data.TakeRows(idv, args.MaxRowsToRead); + + // read all the data into memory. + // list items are rows of the dataset. 
+ var data = new List[]>(); + using (var cursor = idv.GetRowCursor(idv.Schema)) + { + var column = cursor.Schema.GetColumnOrNull("C").Value; + var colType = column.Type; + ValueGetter>> vecGetter = null; + ValueGetter> oneGetter = null; + bool isVector = colType.IsVector(); + if (isVector) { vecGetter = cursor.GetGetter>>(column); } + else + { + oneGetter = cursor.GetGetter>(column); + } + + VBuffer> line = default; + ReadOnlyMemory tsValue = default; + while (cursor.MoveNext()) + { + if (isVector) + { + vecGetter(ref line); + var values = new ReadOnlyMemory[args.ColumnCount]; + line.CopyTo(values); + data.Add(values); + } + else + { + oneGetter(ref tsValue); + var values = new[] { tsValue }; + data.Add(values); + } + } + } + + if (data.Count < 2) + { + // too few rows for automatic inference + return InferenceResult.Fail(); + } + + var cols = new IntermediateColumn[args.ColumnCount]; + for (int i = 0; i < args.ColumnCount; i++) + { + cols[i] = new IntermediateColumn(data.Select(x => x[i]).ToArray(), i); + } + + foreach (var expert in GetExperts()) + { + expert.Apply(cols); + } + + // Aggregating header signals. 
+ int suspect = 0; + var usedNames = new HashSet(); + for (int i = 0; i < args.ColumnCount; i++) + { + if (cols[i].HasHeader == true) + { + if (usedNames.Add(cols[i].RawData[0].ToString())) + suspect++; + else + { + // duplicate value in the first column is a strong signal that this is not a header + suspect -= args.ColumnCount; + } + } + else if (cols[i].HasHeader == false) + suspect--; + } + + // suggest names + usedNames.Clear(); + foreach (var col in cols) + { + string name0; + string name; + name0 = name = SuggestName(col, args.HasHeader); + int i = 0; + while (!usedNames.Add(name)) + { + name = string.Format("{0}_{1:00}", name0, i++); + } + col.Name = name; + } + + // validate & retrieve label column + var labelColumn = GetAndValidateLabelColumn(args, cols); + + // if label column has all Boolean values, set its type as Boolean + if (labelColumn.HasAllBooleanValues()) + { + labelColumn.SuggestedType = BooleanDataViewType.Instance; + } + + var outCols = cols.Select(x => new Column(x.ColumnId, x.Name, x.SuggestedType)).ToArray(); + + return InferenceResult.Success(outCols, args.HasHeader, cols.Select(col => col.RawData).ToArray()); + } + + private static string SuggestName(IntermediateColumn column, bool hasHeader) + { + var header = column.RawData[0].ToString(); + return (hasHeader && !string.IsNullOrWhiteSpace(header)) ? 
header : string.Format("col{0}", column.ColumnId); + } + + private static IntermediateColumn GetAndValidateLabelColumn(Arguments args, IntermediateColumn[] cols) + { + IntermediateColumn labelColumn = null; + if (args.LabelColumnIndex != null) + { + // if label column index > inferred # of columns, throw error + if (args.LabelColumnIndex >= cols.Count()) + { + throw new ArgumentOutOfRangeException(nameof(args.LabelColumnIndex), $"Label column index ({args.LabelColumnIndex}) is >= than # of inferred columns ({cols.Count()})."); + } + + labelColumn = cols[args.LabelColumnIndex.Value]; + } + else + { + labelColumn = cols.FirstOrDefault(c => c.Name == args.Label); + if (labelColumn == null) + { + throw new ArgumentException($"Specified label column '{args.Label}' was not found."); + } + } + + return labelColumn; + } + + public static TextLoader.Column[] GenerateLoaderColumns(Column[] columns) + { + var loaderColumns = new List(); + foreach (var col in columns) + { + var loaderColumn = new TextLoader.Column(col.SuggestedName, col.ItemType.GetRawKind().ToDataKind(), col.ColumnIndex); + loaderColumns.Add(loaderColumn); + } + return loaderColumns.ToArray(); + } + } + +} diff --git a/src/Microsoft.ML.AutoML/ColumnInference/PurposeInference.cs b/src/Microsoft.ML.AutoML/ColumnInference/PurposeInference.cs new file mode 100644 index 0000000000..cc568abdcb --- /dev/null +++ b/src/Microsoft.ML.AutoML/ColumnInference/PurposeInference.cs @@ -0,0 +1,283 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + /// + /// Automatic inference of column purposes for the data view. + /// This is used in the context of text import wizard, but can be used outside as well. 
+ /// + internal static class PurposeInference + { + public const int MaxRowsToRead = 1000; + + public class Column + { + public readonly int ColumnIndex; + public readonly ColumnPurpose Purpose; + + public Column(int columnIndex, ColumnPurpose purpose) + { + ColumnIndex = columnIndex; + Purpose = purpose; + } + } + + /// + /// The design is the same as for ColumnTypeInference: there's a sequence of 'experts' + /// that each look at all the columns. Every expert may or may not assign the 'answer' (suggested purpose) + /// to a column. If the expert needs some information about the column (for example, the column values), + /// this information is lazily calculated by the column object, not the expert itself, to allow the reuse + /// of the same information by another expert. + /// + private interface IPurposeInferenceExpert + { + void Apply(IntermediateColumn[] columns); + } + + private class IntermediateColumn + { + private readonly IDataView _data; + private readonly int _columnId; + private ColumnPurpose _suggestedPurpose; + private readonly Lazy _type; + private readonly Lazy _columnName; + private IReadOnlyList> _cachedData; + + public bool IsPurposeSuggested { get; private set; } + + public ColumnPurpose SuggestedPurpose + { + get { return _suggestedPurpose; } + set + { + _suggestedPurpose = value; + IsPurposeSuggested = true; + } + } + + public DataViewType Type { get { return _type.Value; } } + + public string ColumnName { get { return _columnName.Value; } } + + public IntermediateColumn(IDataView data, int columnId, ColumnPurpose suggestedPurpose = ColumnPurpose.Ignore) + { + _data = data; + _columnId = columnId; + _type = new Lazy(() => _data.Schema[_columnId].Type); + _columnName = new Lazy(() => _data.Schema[_columnId].Name); + _suggestedPurpose = suggestedPurpose; + } + + public Column GetColumn() + { + return new Column(_columnId, _suggestedPurpose); + } + + public IReadOnlyList> GetColumnData() + { + if (_cachedData != null) + return _cachedData; + + var results = new 
List>(); + var column = _data.Schema[_columnId]; + + using (var cursor = _data.GetRowCursor(new[] { column })) + { + var getter = cursor.GetGetter>(column); + while (cursor.MoveNext()) + { + var value = default(ReadOnlyMemory); + getter(ref value); + + var copy = new ReadOnlyMemory(value.ToArray()); + + results.Add(copy); + } + } + + _cachedData = results; + + return results; + } + } + + private static class Experts + { + internal sealed class TextClassification : IPurposeInferenceExpert + { + public void Apply(IntermediateColumn[] columns) + { + string[] commonImageExtensions = { ".bmp", ".dib", ".rle", ".jpg", ".jpeg", ".jpe", ".jfif", ".gif", ".tif", ".tiff", ".png" }; + foreach (var column in columns) + { + if (column.IsPurposeSuggested || !column.Type.IsText()) + continue; + + var data = column.GetColumnData(); + + long sumLength = 0; + int sumSpaces = 0; + var seen = new HashSet(); + int imagePathCount = 0; + foreach (var span in data) + { + sumLength += span.Length; + seen.Add(span.ToString()); + string spanStr = span.ToString(); + sumSpaces += spanStr.Count(x => x == ' '); + + foreach (var ext in commonImageExtensions) + { + if (spanStr.EndsWith(ext, StringComparison.OrdinalIgnoreCase)) + { + imagePathCount++; + break; + } + } + } + + if (imagePathCount < data.Count - 1) + { + Double avgLength = 1.0 * sumLength / data.Count; + Double cardinalityRatio = 1.0 * seen.Count / data.Count; + Double avgSpaces = 1.0 * sumSpaces / data.Count; + if (cardinalityRatio < 0.7) + column.SuggestedPurpose = ColumnPurpose.CategoricalFeature; + // (note: the columns.Count() == 1 condition below, in case a dataset has only + // a 'name' and a 'label' column, forces what would be an 'ignore' column to become a text feature) + else if (cardinalityRatio >= 0.85 && (avgLength > 30 || avgSpaces >= 1 || columns.Count() == 1)) + column.SuggestedPurpose = ColumnPurpose.TextFeature; + else if (cardinalityRatio >= 0.9) + column.SuggestedPurpose = ColumnPurpose.Ignore; + } + else + 
column.SuggestedPurpose = ColumnPurpose.ImagePath; + } + } + } + + internal sealed class NumericAreFeatures : IPurposeInferenceExpert + { + public void Apply(IntermediateColumn[] columns) + { + foreach (var column in columns) + { + if (column.IsPurposeSuggested) + continue; + if (column.Type.GetItemType().IsNumber()) + column.SuggestedPurpose = ColumnPurpose.NumericFeature; + } + } + } + + internal sealed class BooleanProcessing : IPurposeInferenceExpert + { + public void Apply(IntermediateColumn[] columns) + { + foreach (var column in columns) + { + if (column.IsPurposeSuggested) + continue; + if (column.Type.GetItemType().IsBool()) + column.SuggestedPurpose = ColumnPurpose.NumericFeature; + } + } + } + + internal sealed class TextArraysAreText : IPurposeInferenceExpert + { + public void Apply(IntermediateColumn[] columns) + { + foreach (var column in columns) + { + if (column.IsPurposeSuggested) + continue; + if (column.Type.IsVector() && column.Type.GetItemType().IsText()) + column.SuggestedPurpose = ColumnPurpose.TextFeature; + } + } + } + + internal sealed class IgnoreEverythingElse : IPurposeInferenceExpert + { + public void Apply(IntermediateColumn[] columns) + { + foreach (var column in columns) + { + if (!column.IsPurposeSuggested) + column.SuggestedPurpose = ColumnPurpose.Ignore; + } + } + } + } + + private static IEnumerable GetExperts() + { + // Each of the experts respects the decisions of all the experts above. + + // Single-value text columns may be category, name, text or ignore. + yield return new Experts.TextClassification(); + // Vector-value text columns are always treated as text. + // REVIEW: could be improved. + yield return new Experts.TextArraysAreText(); + // Check column on boolean only values. + yield return new Experts.BooleanProcessing(); + // All numeric columns are features. + yield return new Experts.NumericAreFeatures(); + // Everything else is ignored. 
+ yield return new Experts.IgnoreEverythingElse(); + } + + /// + /// Auto-detect purpose for the data view columns. + /// + public static PurposeInference.Column[] InferPurposes(MLContext context, IDataView data, + ColumnInformation columnInfo) + { + data = context.Data.TakeRows(data, MaxRowsToRead); + + var allColumns = new List(); + var columnsToInfer = new List(); + + for (var i = 0; i < data.Schema.Count; i++) + { + var column = data.Schema[i]; + IntermediateColumn intermediateCol; + + if (column.IsHidden) + { + intermediateCol = new IntermediateColumn(data, i, ColumnPurpose.Ignore); + allColumns.Add(intermediateCol); + continue; + } + + var columnPurpose = columnInfo.GetColumnPurpose(column.Name); + if (columnPurpose == null) + { + intermediateCol = new IntermediateColumn(data, i); + columnsToInfer.Add(intermediateCol); + } + else + { + intermediateCol = new IntermediateColumn(data, i, columnPurpose.Value); + } + + allColumns.Add(intermediateCol); + } + + foreach (var expert in GetExperts()) + { + expert.Apply(columnsToInfer.ToArray()); + } + + return allColumns.Select(c => c.GetColumn()).ToArray(); + } + } +} diff --git a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs new file mode 100644 index 0000000000..fe0066ab6e --- /dev/null +++ b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs @@ -0,0 +1,124 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + /// + /// Utilities for various heuristics against text files. + /// Currently, separator inference and column count detection. 
+ /// + internal static class TextFileContents + { + public class ColumnSplitResult + { + public readonly bool IsSuccess; + public readonly char? Separator; + public readonly int ColumnCount; + + public bool AllowQuote { get; set; } + public bool AllowSparse { get; set; } + + public ColumnSplitResult(bool isSuccess, char? separator, bool allowQuote, bool allowSparse, int columnCount) + { + IsSuccess = isSuccess; + Separator = separator; + AllowQuote = allowQuote; + AllowSparse = allowSparse; + ColumnCount = columnCount; + } + } + + // If the fraction of lines having the same number of columns exceeds this, we consider the column count to be known. + private const Double UniformColumnCountThreshold = 0.98; + + public static readonly char[] DefaultSeparators = { '\t', ',', ' ', ';' }; + + /// + /// Attempt to detect text loader arguments. + /// The algorithm selects the first 'acceptable' set: the one that recognizes the same number of columns in at + /// least UniformColumnCountThreshold of the sample's lines, + /// and this number of columns is more than 1. + /// We sweep on separator, allow sparse and allow quote parameter. + /// + public static ColumnSplitResult TrySplitColumns(MLContext context, IMultiStreamSource source, char[] separatorCandidates) + { + var sparse = new[] { false, true }; + var quote = new[] { true, false }; + var foundAny = false; + var result = default(ColumnSplitResult); + foreach (var perm in (from _allowSparse in sparse + from _allowQuote in quote + from _sep in separatorCandidates + select new { _allowSparse, _allowQuote, _sep })) + { + var options = new TextLoader.Options + { + Columns = new[] { new TextLoader.Column() { + Name = "C", + DataKind = DataKind.String, + Source = new[] { new TextLoader.Range(0, null) } + } }, + Separators = new[] { perm._sep }, + AllowQuoting = perm._allowQuote, + AllowSparse = perm._allowSparse + }; + + if (TryParseFile(context, options, source, out result)) + { + foundAny = true; + break; + } + } + return foundAny ? 
result : new ColumnSplitResult(false, null, true, true, 0); + } + + private static bool TryParseFile(MLContext context, TextLoader.Options options, IMultiStreamSource source, + out ColumnSplitResult result) + { + result = null; + // try to instantiate data view with swept arguments + try + { + var textLoader = context.Data.CreateTextLoader(options, source); + var idv = context.Data.TakeRows(textLoader.Load(source), 1000); + var columnCounts = new List(); + var column = idv.Schema["C"]; + + using (var cursor = idv.GetRowCursor(new[] { column })) + { + var getter = cursor.GetGetter>>(column); + + VBuffer> line = default; + while (cursor.MoveNext()) + { + getter(ref line); + columnCounts.Add(line.Length); + } + } + + var mostCommon = columnCounts.GroupBy(x => x).OrderByDescending(x => x.Count()).First(); + if (mostCommon.Count() < UniformColumnCountThreshold * columnCounts.Count) + { + return false; + } + + // disallow single-column case + if (mostCommon.Key <= 1) { return false; } + + result = new ColumnSplitResult(true, options.Separators.First(), options.AllowQuoting, options.AllowSparse, mostCommon.Key); + return true; + } + // fail gracefully if unable to instantiate data view with swept arguments + catch(Exception) + { + return false; + } + } + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.AutoML/ColumnInference/TextFileSample.cs b/src/Microsoft.ML.AutoML/ColumnInference/TextFileSample.cs new file mode 100644 index 0000000000..5d9510f720 --- /dev/null +++ b/src/Microsoft.ML.AutoML/ColumnInference/TextFileSample.cs @@ -0,0 +1,304 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// This class holds an in-memory sample of the text file, and serves as a proxy to it.
+    /// </summary>
+    internal sealed class TextFileSample : IMultiStreamSource
+    {
+        // REVIEW: consider including multiple files via IMultiStreamSource.
+
+        // REVIEW: right now, it expects 0x0A being the trailing character of line break.
+        // Consider a more general implementation.
+
+        private const int BufferSizeMb = 4;
+        private const int FirstChunkSizeMb = 1;
+        private const int LinesPerChunk = 20;
+        private const Double OversamplingRate = 1.1;
+
+        private readonly byte[] _buffer;
+        private readonly long? _fullFileSize;
+        private readonly long? _approximateRowCount;
+
+        private TextFileSample(byte[] buffer, long? fullFileSize, long? lineCount)
+        {
+            _buffer = buffer;
+            _fullFileSize = fullFileSize;
+            _approximateRowCount = lineCount;
+        }
+
+        public int Count
+        {
+            get { return 1; }
+        }
+
+        // Full file size, if known, otherwise, null.
+        public long? FullFileSize
+        {
+            get { return _fullFileSize; }
+        }
+
+        public int SampleSize
+        {
+            get { return _buffer.Length; }
+        }
+
+        public string GetPathOrNull(int index)
+        {
+            //Contracts.Check(index == 0, "Index must be 0");
+            return null;
+        }
+
+        public Stream Open(int index)
+        {
+            //Contracts.Check(index == 0, "Index must be 0");
+            return new MemoryStream(_buffer);
+        }
+
+        public TextReader OpenTextReader(int index)
+        {
+            return new StreamReader(Open(index));
+        }
+
+        public long? ApproximateRowCount => _approximateRowCount;
+
+        public static TextFileSample CreateFromFullFile(string path)
+        {
+            using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
+            {
+                return CreateFromFullStream(fs);
+            }
+        }
+
+        /// <summary>
+        /// Create a <see cref="TextFileSample"/> by reading multiple chunks from the file (or other source) and
+        /// then stitching them together. The algorithm is as follows:
+        /// 0. If the source is not seekable, revert to <see cref="CreateFromHead"/>.
+        /// 1. If the file length is less than 2 * <see cref="BufferSizeMb"/>, revert to <see cref="CreateFromHead"/>.
+        /// 2. Read first <see cref="FirstChunkSizeMb"/> MB chunk. Determine average line length in the chunk.
+        /// 3. Determine how large one chunk should be, and how many chunks there should be, to end up
+        /// with <see cref="BufferSizeMb"/> * <see cref="OversamplingRate"/> MB worth of lines.
+        /// 4. Determine seek locations and read the chunks.
+        /// 5. Stitch and return a <see cref="TextFileSample"/>.
+        /// </summary>
+        /// <exception cref="ArgumentException">No line break was found in the sampled data.</exception>
+        public static TextFileSample CreateFromFullStream(Stream stream)
+        {
+            if (!stream.CanSeek)
+            {
+                return CreateFromHead(stream);
+            }
+            var fileSize = stream.Length;
+
+            if (fileSize <= 2 * BufferSizeMb * (1 << 20))
+            {
+                return CreateFromHead(stream);
+            }
+
+            // Remember where we started so the head-only fallback below can re-read from the same spot.
+            long startPosition = stream.Position;
+
+            var firstChunk = new byte[FirstChunkSizeMb * (1 << 20)];
+            ReadFully(stream, firstChunk, firstChunk.Length);
+            if (!IsEncodingOkForSampling(firstChunk))
+            {
+                // The first chunk has already been consumed; rewind, otherwise the "head" sample
+                // would start in the middle of a line, 1 MB into the file.
+                stream.Seek(startPosition, SeekOrigin.Begin);
+                return CreateFromHead(stream);
+            }
+            // REVIEW: CreateFromHead still truncates the file before the last 0x0A byte. For multi-byte encoding,
+            // this might cause an unfinished string to be present in the buffer. Right now this is considered an acceptable
+            // price to pay for parse-free processing.
+
+            var lineCount = firstChunk.Count(x => x == '\n');
+            if (lineCount == 0)
+            {
+                throw new ArgumentException("Counldn't identify line breaks. Provided file is not text?");
+            }
+
+            long approximateRowCount = (long)(lineCount * fileSize * 1.0 / firstChunk.Length);
+            var firstNewline = Array.FindIndex(firstChunk, x => x == '\n');
+
+            // First line may be header, so we exclude it. The remaining lineCount-1 line breaks are
+            // splitting the text into lineCount lines, and the last line is actually half-size.
+            Double averageLineLength = 2.0 * (firstChunk.Length - firstNewline) / (lineCount * 2 - 1);
+            averageLineLength = Math.Max(averageLineLength, 3);
+
+            int usefulChunkSize = (int)(averageLineLength * LinesPerChunk);
+            int chunkSize = (int)(usefulChunkSize + averageLineLength); // assuming that 1 line worth will be trimmed out
+
+            int chunkCount = (int)Math.Ceiling((BufferSizeMb * OversamplingRate - FirstChunkSizeMb) * (1 << 20) / usefulChunkSize);
+            int maxChunkCount = (int)Math.Floor((double)(fileSize - firstChunk.Length) / chunkSize);
+            chunkCount = Math.Min(chunkCount, maxChunkCount);
+
+            var chunks = new List<byte[]>();
+            chunks.Add(firstChunk);
+
+            // determine the start of each remaining chunk
+            long fileSizeRemaining = fileSize - firstChunk.Length - ((long)chunkSize) * chunkCount;
+
+            var chunkStartIndices = Enumerable.Range(0, chunkCount)
+                .Select(x => AutoMlUtils.Random.Value.NextDouble() * fileSizeRemaining)
+                .OrderBy(x => x)
+                .Select((spot, i) => (long)(spot + firstChunk.Length + i * chunkSize))
+                .ToArray();
+
+            foreach (var chunkStartIndex in chunkStartIndices)
+            {
+                stream.Seek(chunkStartIndex, SeekOrigin.Begin);
+                byte[] chunk = new byte[chunkSize];
+                int readCount = ReadFully(stream, chunk, chunkSize);
+                // Keep only the bytes actually read so a short read does not leave zero padding in the sample.
+                Array.Resize(ref chunk, readCount);
+                chunks.Add(chunk);
+            }
+
+            return new TextFileSample(StitchChunks(false, chunks.ToArray()), fileSize, approximateRowCount);
+        }
+
+        /// <summary>
+        /// Create a <see cref="TextFileSample"/> by reading one chunk from the beginning.
+        /// </summary>
+        private static TextFileSample CreateFromHead(Stream stream)
+        {
+            var buf = new byte[BufferSizeMb * (1 << 20)];
+            int readCount = ReadFully(stream, buf, buf.Length);
+            // ReadFully stops early only at end of stream.
+            bool reachedEnd = readCount < buf.Length;
+            Array.Resize(ref buf, readCount);
+
+            // Stream.Length throws NotSupportedException on non-seekable streams, so only query it when allowed.
+            long? fullFileSize = stream.CanSeek ? (long?)stream.Length : null;
+            bool wholeFile = fullFileSize.HasValue ? readCount == fullFileSize.Value : reachedEnd;
+
+            // Scale the newline count by the sampled fraction; skip when the size is unknown or nothing was read
+            // (an empty buffer would otherwise divide by zero; StitchChunks reports the empty case).
+            long? approximateRowCount = null;
+            if (fullFileSize.HasValue && buf.Length > 0)
+            {
+                approximateRowCount = (long)(buf.Count(x => x == '\n') * (double)fullFileSize.Value / buf.Length);
+            }
+
+            return new TextFileSample(StitchChunks(wholeFile, buf), fullFileSize, approximateRowCount);
+        }
+
+        /// <summary>
+        /// Reads from <paramref name="stream"/> until <paramref name="count"/> bytes have been stored in
+        /// <paramref name="buffer"/> or the end of the stream is reached. A single Stream.Read call is
+        /// allowed to return fewer bytes than requested even when more data is available.
+        /// </summary>
+        /// <returns>The number of bytes actually read.</returns>
+        private static int ReadFully(Stream stream, byte[] buffer, int count)
+        {
+            int total = 0;
+            while (total < count)
+            {
+                int read = stream.Read(buffer, total, count - total);
+                if (read == 0)
+                {
+                    break;
+                }
+                total += read;
+            }
+            return total;
+        }
+
+        /// <summary>
+        /// Given an array of chunks of the text file, of which the first chunk is the head,
+        /// this method trims incomplete lines from the beginning and end of each chunk
+        /// (except that it doesn't trim the beginning of the first chunk and end of last chunk if we read whole file),
+        /// then joins the rest together to form a final byte buffer and returns a <see cref="TextFileSample"/>
+        /// wrapped around it.
+        /// </summary>
+        /// <param name="wholeFile">did we read whole file</param>
+        /// <param name="chunks">chunks of data</param>
+        /// <returns></returns>
+        private static byte[] StitchChunks(bool wholeFile, params byte[][] chunks)
+        {
+            using (var resultStream = new MemoryStream(BufferSizeMb * (1 << 20)))
+            {
+                for (int i = 0; i < chunks.Length; i++)
+                {
+                    int iMin = (i == 0) ? 0 : Array.FindIndex(chunks[i], x => x == '\n') + 1;
+                    int iLim = (wholeFile && i == chunks.Length - 1)
+                        ? chunks[i].Length
+                        : Array.FindLastIndex(chunks[i], x => x == '\n') + 1;
+
+                    if (iLim == 0)
+                    {
+                        //entire buffer is one string, skip
+                        continue;
+                    }
+
+                    resultStream.Write(chunks[i], iMin, iLim - iMin);
+                }
+
+                var resultBuffer = resultStream.ToArray();
+                if (resultBuffer.Length == 0)
+                {
+                    throw new ArgumentException("File is not text, or couldn't detect line breaks");
+                }
+
+                return resultBuffer;
+            }
+        }
+
+        /// <summary>
+        /// Detect whether we can auto-detect EOL characters without parsing.
+        /// If we do, we can cheaply sample from different file locations and trim the partial strings.
+        /// The encodings that pass the test are UTF8 and all single-byte encodings.
+        /// </summary>
+        private static bool IsEncodingOkForSampling(byte[] buffer)
+        {
+            // First check if a BOM/signature exists (sourced from https://www.unicode.org/faq/utf_bom.html#bom4)
+            if (buffer.Length >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0xFE && buffer[3] == 0xFF)
+            {
+                // UTF-32, big-endian
+                return false;
+            }
+            if (buffer.Length >= 4 && buffer[0] == 0xFF && buffer[1] == 0xFE && buffer[2] == 0x00 && buffer[3] == 0x00)
+            {
+                // UTF-32, little-endian
+                return false;
+            }
+            if (buffer.Length >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF)
+            {
+                // UTF-16, big-endian
+                return false;
+            }
+            if (buffer.Length >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE)
+            {
+                // UTF-16, little-endian
+                return false;
+            }
+            if (buffer.Length >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
+            {
+                // UTF-8
+                return true;
+            }
+            if (buffer.Length >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
+            {
+                // UTF-7
+                return true;
+            }
+
+            // No BOM/signature was found, so now we need to 'sniff' the file to see if can manually discover the encoding.
+            int sniffLim = Math.Min(1000, buffer.Length);
+
+            // Some text files are encoded in UTF8, but have no BOM/signature. Hence the below manually checks for a UTF8 pattern. This code is based off
+            // the top answer at: https://stackoverflow.com/questions/6555015/check-for-invalid-utf8 .
+            int i = 0;
+            bool utf8 = false;
+            while (i < sniffLim - 4)
+            {
+                if (buffer[i] <= 0x7F)
+                {
+                    i += 1;
+                    continue;
+                }
+                if (buffer[i] >= 0xC2 && buffer[i] <= 0xDF && buffer[i + 1] >= 0x80 && buffer[i + 1] < 0xC0)
+                {
+                    i += 2;
+                    utf8 = true;
+                    continue;
+                }
+                // Three-byte lead bytes are 0xE0..0xEF; 0xF0 starts a four-byte sequence and is handled below.
+                if (buffer[i] >= 0xE0 && buffer[i] <= 0xEF && buffer[i + 1] >= 0x80 && buffer[i + 1] < 0xC0 &&
+                    buffer[i + 2] >= 0x80 && buffer[i + 2] < 0xC0)
+                {
+                    i += 3;
+                    utf8 = true;
+                    continue;
+                }
+                if (buffer[i] >= 0xF0 && buffer[i] <= 0xF4 && buffer[i + 1] >= 0x80 && buffer[i + 1] < 0xC0 &&
+                    buffer[i + 2] >= 0x80 && buffer[i + 2] < 0xC0 && buffer[i + 3] >= 0x80 && buffer[i + 3] < 0xC0)
+                {
+                    i += 4;
+                    utf8 = true;
+                    continue;
+                }
+                utf8 = false;
+                break;
+            }
+            if (utf8)
+            {
+                return true;
+            }
+
+            if (buffer.Take(sniffLim).Any(x => x == 0))
+            {
+                // likely a UTF-16 or UTF-32 without a BOM.
+                return false;
+            }
+
+            // If all else failed, the file is likely in a local 1-byte encoding.
+            return true;
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/DatasetDimensions/ColumnDimensions.cs b/src/Microsoft.ML.AutoML/DatasetDimensions/ColumnDimensions.cs
new file mode 100644
index 0000000000..94512f4620
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/DatasetDimensions/ColumnDimensions.cs
@@ -0,0 +1,18 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.AutoML
+{
+    internal class ColumnDimensions
+    {
+        public int? Cardinality;
+        public bool? HasMissing;
+
+        public ColumnDimensions(int? cardinality, bool? hasMissing)
+        {
+            Cardinality = cardinality;
+            HasMissing = hasMissing;
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/DatasetDimensions/DatasetDimensionsApi.cs b/src/Microsoft.ML.AutoML/DatasetDimensions/DatasetDimensionsApi.cs
new file mode 100644
index 0000000000..4282610fc4
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/DatasetDimensions/DatasetDimensionsApi.cs
@@ -0,0 +1,50 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    internal class DatasetDimensionsApi
+    {
+        private const long MaxRowsToRead = 1000;
+
+        public static ColumnDimensions[] CalcColumnDimensions(MLContext context, IDataView data, PurposeInference.Column[] purposes)
+        {
+            data = context.Data.TakeRows(data, MaxRowsToRead);
+
+            var colDimensions = new ColumnDimensions[data.Schema.Count];
+
+            for (var i = 0; i < data.Schema.Count; i++)
+            {
+                var column = data.Schema[i];
+                var purpose = purposes[i];
+
+                // default column dimensions
+                int? cardinality = null;
+                bool? hasMissing = null;
+
+                var itemType = column.Type.GetItemType();
+
+                // If categorical text feature, calculate cardinality
+                if (itemType.IsText() && purpose.Purpose == ColumnPurpose.CategoricalFeature)
+                {
+                    cardinality = DatasetDimensionsUtil.GetTextColumnCardinality(data, column);
+                }
+
+                // If numeric feature, discover missing values
+                if (itemType == NumberDataViewType.Single)
+                {
+                    hasMissing = column.Type.IsVector() ?
+                        DatasetDimensionsUtil.HasMissingNumericVector(data, column) :
+                        DatasetDimensionsUtil.HasMissingNumericSingleValue(data, column);
+                }
+
+                colDimensions[i] = new ColumnDimensions(cardinality, hasMissing);
+            }
+
+            return colDimensions;
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/DatasetDimensions/DatasetDimensionsUtil.cs b/src/Microsoft.ML.AutoML/DatasetDimensions/DatasetDimensionsUtil.cs
new file mode 100644
index 0000000000..a0a274836a
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/DatasetDimensions/DatasetDimensionsUtil.cs
@@ -0,0 +1,86 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+using Microsoft.ML.Internal.Utilities;
+
+namespace Microsoft.ML.AutoML
+{
+    internal static class DatasetDimensionsUtil
+    {
+        public static int GetTextColumnCardinality(IDataView data, DataViewSchema.Column column)
+        {
+            var seen = new HashSet<string>();
+            using (var cursor = data.GetRowCursor(new[] { column }))
+            {
+                var getter = cursor.GetGetter<ReadOnlyMemory<char>>(column);
+                while (cursor.MoveNext())
+                {
+                    var value = default(ReadOnlyMemory<char>);
+                    getter(ref value);
+                    var valueStr = value.ToString();
+                    seen.Add(valueStr);
+                }
+            }
+            return seen.Count;
+        }
+
+        public static bool HasMissingNumericSingleValue(IDataView data, DataViewSchema.Column column)
+        {
+            using (var cursor = data.GetRowCursor(new[] { column }))
+            {
+                var getter = cursor.GetGetter<Single>(column);
+                var value = default(Single);
+                while (cursor.MoveNext())
+                {
+                    getter(ref value);
+                    if (Single.IsNaN(value))
+                    {
+                        return true;
+                    }
+                }
+                return false;
+            }
+        }
+
+        public static bool HasMissingNumericVector(IDataView data, DataViewSchema.Column column)
+        {
+            using (var cursor = data.GetRowCursor(new[] { column }))
+            {
+                var getter = cursor.GetGetter<VBuffer<Single>>(column);
+                var value = default(VBuffer<Single>);
+                while (cursor.MoveNext())
+                {
+                    getter(ref value);
+                    if (VBufferUtils.HasNaNs(value))
+                    {
+                        return true;
+                    }
+                }
+                return false;
+            }
+        }
+
+        public static ulong CountRows(IDataView data, ulong maxRows)
+        {
+            // Dispose the cursor, as the other helpers in this class do.
+            using (var cursor = data.GetRowCursor(new[] { data.Schema[0] }))
+            {
+                ulong rowCount = 0;
+                while (cursor.MoveNext())
+                {
+                    if (++rowCount == maxRows)
+                    {
+                        break;
+                    }
+                }
+                return rowCount;
+            }
+        }
+
+        public static bool IsDataViewEmpty(IDataView data)
+        {
+            return CountRows(data, 1) == 0;
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/EstimatorExtensions/EstimatorExtensionCatalog.cs b/src/Microsoft.ML.AutoML/EstimatorExtensions/EstimatorExtensionCatalog.cs
new file mode 100644
index 0000000000..ad66920bfb
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/EstimatorExtensions/EstimatorExtensionCatalog.cs
@@ -0,0 +1,49 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+
+namespace Microsoft.ML.AutoML
+{
+    internal enum EstimatorName
+    {
+        ColumnConcatenating,
+        ColumnCopying,
+        KeyToValueMapping,
+        MissingValueIndicating,
+        MissingValueReplacing,
+        Normalizing,
+        OneHotEncoding,
+        OneHotHashEncoding,
+        TextFeaturizing,
+        TypeConverting,
+        ValueToKeyMapping
+    }
+
+    internal class EstimatorExtensionCatalog
+    {
+        private static readonly IDictionary<EstimatorName, Type> _namesToExtensionTypes = new
+            Dictionary<EstimatorName, Type>()
+        {
+            { EstimatorName.ColumnConcatenating, typeof(ColumnConcatenatingExtension) },
+            { EstimatorName.ColumnCopying, typeof(ColumnCopyingExtension) },
+            { EstimatorName.KeyToValueMapping, typeof(KeyToValueMappingExtension) },
+            { EstimatorName.MissingValueIndicating, typeof(MissingValueIndicatingExtension) },
+            { EstimatorName.MissingValueReplacing, typeof(MissingValueReplacingExtension) },
+            { EstimatorName.Normalizing, typeof(NormalizingExtension) },
+            { EstimatorName.OneHotEncoding, typeof(OneHotEncodingExtension) },
+            { EstimatorName.OneHotHashEncoding, typeof(OneHotHashEncodingExtension) },
+            { EstimatorName.TextFeaturizing, typeof(TextFeaturizingExtension) },
+            { EstimatorName.TypeConverting, typeof(TypeConvertingExtension) },
+            { EstimatorName.ValueToKeyMapping, typeof(ValueToKeyMappingExtension) },
+        };
+
+        public static IEstimatorExtension GetExtension(EstimatorName estimatorName)
+        {
+            var extType = _namesToExtensionTypes[estimatorName];
+            return (IEstimatorExtension)Activator.CreateInstance(extType);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/EstimatorExtensions/EstimatorExtensions.cs b/src/Microsoft.ML.AutoML/EstimatorExtensions/EstimatorExtensions.cs
new file mode 100644
index 0000000000..98b85f69da
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/EstimatorExtensions/EstimatorExtensions.cs
@@ -0,0 +1,272 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+using Microsoft.ML.Transforms;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>Builds the column-concatenation transform (many input columns, one output column).</summary>
+    internal class ColumnConcatenatingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns, pipelineNode.OutColumns[0]);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string[] inColumns, string outColumn)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.ColumnConcatenating.ToString(),
+                PipelineNodeType.Transform, inColumns, outColumn);
+            var estimator = CreateInstance(context, inColumns, outColumn);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string[] inColumns, string outColumn)
+        {
+            return context.Transforms.Concatenate(outColumn, inColumns);
+        }
+    }
+
+    /// <summary>Builds the column-copy transform (one input column, one output column).</summary>
+    internal class ColumnCopyingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns[0], pipelineNode.OutColumns[0]);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string inColumn, string outColumn)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.ColumnCopying.ToString(),
+                PipelineNodeType.Transform, inColumn, outColumn);
+            var estimator = CreateInstance(context, inColumn, outColumn);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string inColumn, string outColumn)
+        {
+            return context.Transforms.CopyColumns(outColumn, inColumn);
+        }
+    }
+
+    /// <summary>Builds the key-to-value mapping transform.</summary>
+    internal class KeyToValueMappingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns[0], pipelineNode.OutColumns[0]);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string inColumn, string outColumn)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.KeyToValueMapping.ToString(),
+                PipelineNodeType.Transform, inColumn, outColumn);
+            var estimator = CreateInstance(context, inColumn, outColumn);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string inColumn, string outColumn)
+        {
+            return context.Transforms.Conversion.MapKeyToValue(outColumn, inColumn);
+        }
+    }
+
+    /// <summary>Builds the missing-value indicator transform; input and output columns are paired by index.</summary>
+    internal class MissingValueIndicatingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns, pipelineNode.OutColumns);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.MissingValueIndicating.ToString(),
+                PipelineNodeType.Transform, inColumns, outColumns);
+            var estimator = CreateInstance(context, inColumns, outColumns);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var pairs = new InputOutputColumnPair[inColumns.Length];
+            for (var i = 0; i < inColumns.Length; i++)
+            {
+                var pair = new InputOutputColumnPair(outColumns[i], inColumns[i]);
+                pairs[i] = pair;
+            }
+            return context.Transforms.IndicateMissingValues(pairs);
+        }
+    }
+
+    /// <summary>Builds the missing-value replacement transform; input and output columns are paired by index.</summary>
+    internal class MissingValueReplacingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns, pipelineNode.OutColumns);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.MissingValueReplacing.ToString(),
+                PipelineNodeType.Transform, inColumns, outColumns);
+            var estimator = CreateInstance(context, inColumns, outColumns);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var pairs = new InputOutputColumnPair[inColumns.Length];
+            for (var i = 0; i < inColumns.Length; i++)
+            {
+                var pair = new InputOutputColumnPair(outColumns[i], inColumns[i]);
+                pairs[i] = pair;
+            }
+            return context.Transforms.ReplaceMissingValues(pairs);
+        }
+    }
+
+    /// <summary>Builds the min-max normalization transform.</summary>
+    internal class NormalizingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns[0], pipelineNode.OutColumns[0]);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string inColumn, string outColumn)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.Normalizing.ToString(),
+                PipelineNodeType.Transform, inColumn, outColumn);
+            var estimator = CreateInstance(context, inColumn, outColumn);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string inColumn, string outColumn)
+        {
+            return context.Transforms.NormalizeMinMax(outColumn, inColumn);
+        }
+    }
+
+    /// <summary>Builds the one-hot encoding transform; input and output columns are paired by index.</summary>
+    internal class OneHotEncodingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns, pipelineNode.OutColumns);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.OneHotEncoding.ToString(),
+                PipelineNodeType.Transform, inColumns, outColumns);
+            var estimator = CreateInstance(context, inColumns, outColumns);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        public static IEstimator<ITransformer> CreateInstance(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var cols = new InputOutputColumnPair[inColumns.Length];
+            for (var i = 0; i < cols.Length; i++)
+            {
+                cols[i] = new InputOutputColumnPair(outColumns[i], inColumns[i]);
+            }
+            return context.Transforms.Categorical.OneHotEncoding(cols);
+        }
+    }
+
+    /// <summary>Builds the one-hot hash encoding transform; input and output columns are paired by index.</summary>
+    internal class OneHotHashEncodingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns, pipelineNode.OutColumns);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string inColumn, string outColumn)
+        {
+            return CreateSuggestedTransform(context, new[] { inColumn }, new[] { outColumn });
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.OneHotHashEncoding.ToString(),
+                PipelineNodeType.Transform, inColumns, outColumns);
+            var estimator = CreateInstance(context, inColumns, outColumns);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var cols = new InputOutputColumnPair[inColumns.Length];
+            for (var i = 0; i < cols.Length; i++)
+            {
+                cols[i] = new InputOutputColumnPair(outColumns[i], inColumns[i]);
+            }
+            return context.Transforms.Categorical.OneHotHashEncoding(cols);
+        }
+    }
+
+    /// <summary>Builds the text featurization transform.</summary>
+    internal class TextFeaturizingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns[0], pipelineNode.OutColumns[0]);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string inColumn, string outColumn)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.TextFeaturizing.ToString(),
+                PipelineNodeType.Transform, inColumn, outColumn);
+            var estimator = CreateInstance(context, inColumn, outColumn);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string inColumn, string outColumn)
+        {
+            return context.Transforms.Text.FeaturizeText(outColumn, inColumn);
+        }
+    }
+
+    /// <summary>Builds the type conversion transform; input and output columns are paired by index.</summary>
+    internal class TypeConvertingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns, pipelineNode.OutColumns);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.TypeConverting.ToString(),
+                PipelineNodeType.Transform, inColumns, outColumns);
+            var estimator = CreateInstance(context, inColumns, outColumns);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string[] inColumns, string[] outColumns)
+        {
+            var cols = new InputOutputColumnPair[inColumns.Length];
+            for (var i = 0; i < cols.Length; i++)
+            {
+                cols[i] = new InputOutputColumnPair(outColumns[i], inColumns[i]);
+            }
+            return context.Transforms.Conversion.ConvertType(cols);
+        }
+    }
+
+    /// <summary>Builds the value-to-key mapping transform.</summary>
+    internal class ValueToKeyMappingExtension : IEstimatorExtension
+    {
+        public IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode)
+        {
+            return CreateInstance(context, pipelineNode.InColumns[0], pipelineNode.OutColumns[0]);
+        }
+
+        public static SuggestedTransform CreateSuggestedTransform(MLContext context, string inColumn, string outColumn)
+        {
+            var pipelineNode = new PipelineNode(EstimatorName.ValueToKeyMapping.ToString(),
+                PipelineNodeType.Transform, inColumn, outColumn);
+            var estimator = CreateInstance(context, inColumn, outColumn);
+            return new SuggestedTransform(pipelineNode, estimator);
+        }
+
+        private static IEstimator<ITransformer> CreateInstance(MLContext context, string inColumn, string outColumn)
+        {
+            return context.Transforms.Conversion.MapValueToKey(outColumn, inColumn);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/EstimatorExtensions/IEstimatorExtension.cs b/src/Microsoft.ML.AutoML/EstimatorExtensions/IEstimatorExtension.cs
new file mode 100644
index 0000000000..c865b9084b
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/EstimatorExtensions/IEstimatorExtension.cs
@@ -0,0 +1,11 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.AutoML
+{
+    internal interface IEstimatorExtension
+    {
+        IEstimator<ITransformer> CreateInstance(MLContext context, PipelineNode pipelineNode);
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/Experiment.cs b/src/Microsoft.ML.AutoML/Experiment/Experiment.cs
new file mode 100644
index 0000000000..480f776bcc
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/Experiment.cs
@@ -0,0 +1,150 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using Microsoft.ML.Runtime;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Drives an AutoML experiment: repeatedly asks for the next suggested pipeline, runs it,
+    /// records the result, and stops on a perfect model, exhausted candidates, cancellation,
+    /// the model-count limit or the time limit.
+    /// </summary>
+    internal class Experiment<TRunDetail, TMetrics> where TRunDetail : RunDetail
+    {
+        private readonly MLContext _context;
+        private readonly OptimizingMetricInfo _optimizingMetricInfo;
+        private readonly TaskKind _task;
+        private readonly IProgress<TRunDetail> _progressCallback;
+        private readonly ExperimentSettings _experimentSettings;
+        private readonly IMetricsAgent<TMetrics> _metricsAgent;
+        private readonly IEnumerable<TrainerName> _trainerWhitelist;
+        private readonly DirectoryInfo _modelDirectory;
+        private readonly DatasetColumnInfo[] _datasetColumnInfo;
+        private readonly IRunner<TRunDetail> _runner;
+        private readonly IList<SuggestedPipelineRunDetail> _history;
+        private readonly IChannel _logger;
+
+        public Experiment(MLContext context,
+            TaskKind task,
+            OptimizingMetricInfo metricInfo,
+            IProgress<TRunDetail> progressCallback,
+            ExperimentSettings experimentSettings,
+            IMetricsAgent<TMetrics> metricsAgent,
+            IEnumerable<TrainerName> trainerWhitelist,
+            DatasetColumnInfo[] datasetColumnInfo,
+            IRunner<TRunDetail> runner,
+            IChannel logger)
+        {
+            _context = context;
+            _history = new List<SuggestedPipelineRunDetail>();
+            _optimizingMetricInfo = metricInfo;
+            _task = task;
+            _progressCallback = progressCallback;
+            _experimentSettings = experimentSettings;
+            _metricsAgent = metricsAgent;
+            _trainerWhitelist = trainerWhitelist;
+            _modelDirectory = GetModelDirectory(_experimentSettings.CacheDirectory);
+            _datasetColumnInfo = datasetColumnInfo;
+            _runner = runner;
+            _logger = logger;
+        }
+
+        /// <summary>
+        /// Runs the experiment loop and returns the details of every iteration that was executed.
+        /// </summary>
+        /// <exception cref="InvalidOperationException">All of the first three runs failed.</exception>
+        public IList<TRunDetail> Execute()
+        {
+            var stopwatch = Stopwatch.StartNew();
+            var iterationResults = new List<TRunDetail>();
+
+            do
+            {
+                var iterationStopwatch = Stopwatch.StartNew();
+
+                // get next pipeline
+                var getPipelineStopwatch = Stopwatch.StartNew();
+                var pipeline = PipelineSuggester.GetNextInferredPipeline(_context, _history, _datasetColumnInfo, _task, _optimizingMetricInfo.IsMaximizing, _experimentSettings.CacheBeforeTrainer, _trainerWhitelist);
+                // Capture the suggestion time now; the pipeline run below must not be counted in it.
+                getPipelineStopwatch.Stop();
+                var pipelineInferenceTimeInSeconds = getPipelineStopwatch.Elapsed.TotalSeconds;
+
+                // break if no candidates returned, means no valid pipeline available
+                if (pipeline == null)
+                {
+                    break;
+                }
+
+                // evaluate pipeline
+                _logger.Trace($"Evaluating pipeline {pipeline.ToString()}");
+                (SuggestedPipelineRunDetail suggestedPipelineRunDetail, TRunDetail runDetail)
+                    = _runner.Run(pipeline, _modelDirectory, _history.Count + 1);
+                _history.Add(suggestedPipelineRunDetail);
+                WriteIterationLog(pipeline, suggestedPipelineRunDetail, iterationStopwatch);
+
+                runDetail.RuntimeInSeconds = iterationStopwatch.Elapsed.TotalSeconds;
+                runDetail.PipelineInferenceTimeInSeconds = pipelineInferenceTimeInSeconds;
+
+                ReportProgress(runDetail);
+                iterationResults.Add(runDetail);
+
+                // if model is perfect, break
+                if (_metricsAgent.IsModelPerfect(suggestedPipelineRunDetail.Score))
+                {
+                    break;
+                }
+
+                // If after third run, all runs have failed so far, throw exception
+                if (_history.Count == 3 && _history.All(r => !r.RunSucceded))
+                {
+                    throw new InvalidOperationException($"Training failed with the exception: {_history.Last().Exception}");
+                }
+
+            } while (_history.Count < _experimentSettings.MaxModels &&
+                    !_experimentSettings.CancellationToken.IsCancellationRequested &&
+                    stopwatch.Elapsed.TotalSeconds < _experimentSettings.MaxExperimentTimeInSeconds);
+
+            return iterationResults;
+        }
+
+        /// <summary>
+        /// Picks the first unused "experiment{i}" sub-directory of <paramref name="rootDir"/> and creates it.
+        /// Returns <c>null</c> when <paramref name="rootDir"/> is <c>null</c>.
+        /// </summary>
+        private static DirectoryInfo GetModelDirectory(DirectoryInfo rootDir)
+        {
+            if (rootDir == null)
+            {
+                return null;
+            }
+            var subdirs = rootDir.Exists ?
+                new HashSet<string>(rootDir.EnumerateDirectories().Select(d => d.Name)) :
+                new HashSet<string>();
+            string experimentDir;
+            for (var i = 0; ; i++)
+            {
+                experimentDir = $"experiment{i}";
+                if (!subdirs.Contains(experimentDir))
+                {
+                    break;
+                }
+            }
+            var experimentDirFullPath = Path.Combine(rootDir.FullName, experimentDir);
+            var experimentDirInfo = new DirectoryInfo(experimentDirFullPath);
+            if (!experimentDirInfo.Exists)
+            {
+                experimentDirInfo.Create();
+            }
+            return experimentDirInfo;
+        }
+
+        // Reports progress to the caller's callback; a throwing callback is logged, not propagated.
+        private void ReportProgress(TRunDetail iterationResult)
+        {
+            try
+            {
+                _progressCallback?.Report(iterationResult);
+            }
+            catch (Exception ex)
+            {
+                _logger.Error($"Progress report callback reported exception {ex}");
+            }
+        }
+
+        private void WriteIterationLog(SuggestedPipeline pipeline, SuggestedPipelineRunDetail runResult, Stopwatch stopwatch)
+        {
+            _logger.Trace($"{_history.Count}\t{runResult.Score}\t{stopwatch.Elapsed}\t{pipeline.ToString()}");
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/BinaryMetricsAgent.cs b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/BinaryMetricsAgent.cs
new file mode 100644
index 0000000000..e6e0451e9b
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/BinaryMetricsAgent.cs
@@ -0,0 +1,86 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Metrics agent for binary classification. Holds the metric chosen for optimization and
+    /// uses it to turn an evaluated metrics object into a single score.
+    /// </summary>
+    // NOTE(review): "IMetricsAgent" carries no type argument here, yet the interface's members are
+    // declared in terms of T - presumably the generic argument lists were lost from this text;
+    // confirm against the original file.
+    internal class BinaryMetricsAgent : IMetricsAgent
+    {
+        private readonly MLContext _mlContext;
+        private readonly BinaryClassificationMetric _optimizingMetric;
+
+        /// <summary>Stores the context used for evaluation and the metric to optimize.</summary>
+        public BinaryMetricsAgent(MLContext mlContext,
+            BinaryClassificationMetric optimizingMetric)
+        {
+            _mlContext = mlContext;
+            _optimizingMetric = optimizingMetric;
+        }
+
+        /// <summary>
+        /// Returns the property of <paramref name="metrics"/> that matches the optimizing metric.
+        /// Returns NaN when <paramref name="metrics"/> is null; throws the exception built by
+        /// MetricsAgentUtil.BuildMetricNotSupportedException for a metric with no case below.
+        /// </summary>
+        public double GetScore(BinaryClassificationMetrics metrics)
+        {
+            if (metrics == null)
+            {
+                return double.NaN;
+            }
+
+            switch (_optimizingMetric)
+            {
+                case BinaryClassificationMetric.Accuracy:
+                    return metrics.Accuracy;
+                case BinaryClassificationMetric.AreaUnderRocCurve:
+                    return metrics.AreaUnderRocCurve;
+                case BinaryClassificationMetric.AreaUnderPrecisionRecallCurve:
+                    return metrics.AreaUnderPrecisionRecallCurve;
+                case BinaryClassificationMetric.F1Score:
+                    return metrics.F1Score;
+                case BinaryClassificationMetric.NegativePrecision:
+                    return metrics.NegativePrecision;
+                case BinaryClassificationMetric.NegativeRecall:
+                    return metrics.NegativeRecall;
+                case BinaryClassificationMetric.PositivePrecision:
+                    return metrics.PositivePrecision;
+                case BinaryClassificationMetric.PositiveRecall:
+                    return metrics.PositiveRecall;
+                default:
+                    throw MetricsAgentUtil.BuildMetricNotSupportedException(_optimizingMetric);
+            }
+        }
+
+        /// <summary>
+        /// Reports whether <paramref name="score"/> is the best possible value for the optimizing
+        /// metric. A NaN score is never perfect; every metric handled here is perfect at exactly 1.
+        /// </summary>
+        public bool IsModelPerfect(double score)
+        {
+            if (double.IsNaN(score))
+            {
+                return false;
+            }
+
+            switch (_optimizingMetric)
+            {
+                case BinaryClassificationMetric.Accuracy:
+                    return score == 1;
+                case BinaryClassificationMetric.AreaUnderRocCurve:
+                    return score == 1;
+                case BinaryClassificationMetric.AreaUnderPrecisionRecallCurve:
+                    return score == 1;
+                case BinaryClassificationMetric.F1Score:
+                    return score == 1;
+                case BinaryClassificationMetric.NegativePrecision:
+                    return score == 1;
+                case BinaryClassificationMetric.NegativeRecall:
+                    return score == 1;
+                case BinaryClassificationMetric.PositivePrecision:
+                    return score == 1;
+                case BinaryClassificationMetric.PositiveRecall:
+                    return score == 1;
+                default:
+                    throw MetricsAgentUtil.BuildMetricNotSupportedException(_optimizingMetric);
+            }
+        }
+
+        /// <summary>
+        /// Evaluates scored data through the context's non-calibrated binary classification evaluator.
+        /// </summary>
+        public BinaryClassificationMetrics EvaluateMetrics(IDataView data, string labelColumn)
+        {
+            return _mlContext.BinaryClassification.EvaluateNonCalibrated(data, labelColumn);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/IMetricsAgent.cs b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/IMetricsAgent.cs
new file mode 100644
index 0000000000..3531809ba4
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/IMetricsAgent.cs
@@ -0,0 +1,15 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Contract shared by the task-specific metrics agents: evaluate scored data into a metrics
+    /// object, reduce that object to one score, and decide whether a score is perfect.
+    /// </summary>
+    // NOTE(review): T is used below but not declared on the interface as shown - presumably the
+    // type-parameter list was lost from this text; confirm against the original file.
+    internal interface IMetricsAgent
+    {
+        // Reduces a metrics object to the single value being optimized.
+        double GetScore(T metrics);
+
+        // True when the score cannot be improved for the metric being optimized.
+        bool IsModelPerfect(double score);
+
+        // Computes the metrics object for already-scored data.
+        T EvaluateMetrics(IDataView data, string labelColumn);
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/MetricsAgentUtil.cs b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/MetricsAgentUtil.cs
new file mode 100644
index 0000000000..68c2161c8b
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/MetricsAgentUtil.cs
@@ -0,0 +1,16 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>Helpers shared by the metrics agents.</summary>
+    internal static class MetricsAgentUtil
+    {
+        /// <summary>
+        /// Builds (does not throw) the exception the agents raise for an optimizing metric they
+        /// have no case for; the metric value is embedded in the message.
+        /// </summary>
+        // NOTE(review): T is not declared on this method as shown - presumably the type-parameter
+        // list was lost from this text; confirm against the original file.
+        public static NotSupportedException BuildMetricNotSupportedException(T optimizingMetric)
+        {
+            return new NotSupportedException($"{optimizingMetric} is not a supported sweep metric");
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/MultiMetricsAgent.cs b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/MultiMetricsAgent.cs
new file mode 100644
index 0000000000..abd584ae53
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/MultiMetricsAgent.cs
@@ -0,0 +1,74 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Metrics agent for multiclass classification. Holds the metric chosen for optimization and
+    /// uses it to turn an evaluated metrics object into a single score.
+    /// </summary>
+    // NOTE(review): "IMetricsAgent" is written without a type argument - presumably lost from
+    // this text; confirm against the original file.
+    internal class MultiMetricsAgent : IMetricsAgent
+    {
+        private readonly MLContext _mlContext;
+        private readonly MulticlassClassificationMetric _optimizingMetric;
+
+        /// <summary>Stores the context used for evaluation and the metric to optimize.</summary>
+        public MultiMetricsAgent(MLContext mlContext,
+            MulticlassClassificationMetric optimizingMetric)
+        {
+            _mlContext = mlContext;
+            _optimizingMetric = optimizingMetric;
+        }
+
+        /// <summary>
+        /// Returns the property of <paramref name="metrics"/> that matches the optimizing metric.
+        /// Returns NaN when <paramref name="metrics"/> is null; throws the exception built by
+        /// MetricsAgentUtil.BuildMetricNotSupportedException for a metric with no case below.
+        /// </summary>
+        public double GetScore(MulticlassClassificationMetrics metrics)
+        {
+            if (metrics == null)
+            {
+                return double.NaN;
+            }
+
+            switch (_optimizingMetric)
+            {
+                case MulticlassClassificationMetric.MacroAccuracy:
+                    return metrics.MacroAccuracy;
+                case MulticlassClassificationMetric.MicroAccuracy:
+                    return metrics.MicroAccuracy;
+                case MulticlassClassificationMetric.LogLoss:
+                    return metrics.LogLoss;
+                case MulticlassClassificationMetric.LogLossReduction:
+                    return metrics.LogLossReduction;
+                case MulticlassClassificationMetric.TopKAccuracy:
+                    return metrics.TopKAccuracy;
+                default:
+                    throw MetricsAgentUtil.BuildMetricNotSupportedException(_optimizingMetric);
+            }
+        }
+
+        /// <summary>
+        /// Reports whether <paramref name="score"/> is the best possible value for the optimizing
+        /// metric. A NaN score is never perfect. LogLoss is perfect at exactly 0; the other
+        /// handled metrics are perfect at exactly 1.
+        /// </summary>
+        public bool IsModelPerfect(double score)
+        {
+            if (double.IsNaN(score))
+            {
+                return false;
+            }
+
+            switch (_optimizingMetric)
+            {
+                case MulticlassClassificationMetric.MacroAccuracy:
+                    return score == 1;
+                case MulticlassClassificationMetric.MicroAccuracy:
+                    return score == 1;
+                case MulticlassClassificationMetric.LogLoss:
+                    return score == 0;
+                case MulticlassClassificationMetric.LogLossReduction:
+                    return score == 1;
+                case MulticlassClassificationMetric.TopKAccuracy:
+                    return score == 1;
+                default:
+                    throw MetricsAgentUtil.BuildMetricNotSupportedException(_optimizingMetric);
+            }
+        }
+
+        /// <summary>Evaluates scored data through the context's multiclass classification evaluator.</summary>
+        public MulticlassClassificationMetrics EvaluateMetrics(IDataView data, string labelColumn)
+        {
+            return _mlContext.MulticlassClassification.Evaluate(data, labelColumn);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/RegressionMetricsAgent.cs b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/RegressionMetricsAgent.cs
new file mode 100644
index 0000000000..94f5fcdae7
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/MetricsAgents/RegressionMetricsAgent.cs
@@ -0,0 +1,69 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Metrics agent for regression. Holds the metric chosen for optimization and uses it to
+    /// turn an evaluated metrics object into a single score.
+    /// </summary>
+    // NOTE(review): "IMetricsAgent" is written without a type argument - presumably lost from
+    // this text; confirm against the original file.
+    internal class RegressionMetricsAgent : IMetricsAgent
+    {
+        private readonly MLContext _mlContext;
+        private readonly RegressionMetric _optimizingMetric;
+
+        /// <summary>Stores the context used for evaluation and the metric to optimize.</summary>
+        public RegressionMetricsAgent(MLContext mlContext, RegressionMetric optimizingMetric)
+        {
+            _mlContext = mlContext;
+            _optimizingMetric = optimizingMetric;
+        }
+
+        /// <summary>
+        /// Returns the property of <paramref name="metrics"/> that matches the optimizing metric.
+        /// Returns NaN when <paramref name="metrics"/> is null; throws the exception built by
+        /// MetricsAgentUtil.BuildMetricNotSupportedException for a metric with no case below.
+        /// </summary>
+        public double GetScore(RegressionMetrics metrics)
+        {
+            if (metrics == null)
+            {
+                return double.NaN;
+            }
+
+            switch (_optimizingMetric)
+            {
+                case RegressionMetric.MeanAbsoluteError:
+                    return metrics.MeanAbsoluteError;
+                case RegressionMetric.MeanSquaredError:
+                    return metrics.MeanSquaredError;
+                case RegressionMetric.RootMeanSquaredError:
+                    return metrics.RootMeanSquaredError;
+                case RegressionMetric.RSquared:
+                    return metrics.RSquared;
+                default:
+                    throw MetricsAgentUtil.BuildMetricNotSupportedException(_optimizingMetric);
+            }
+        }
+
+        /// <summary>
+        /// Reports whether <paramref name="score"/> is the best possible value for the optimizing
+        /// metric. A NaN score is never perfect. The three error metrics are perfect at exactly 0;
+        /// RSquared is perfect at exactly 1.
+        /// </summary>
+        public bool IsModelPerfect(double score)
+        {
+            if (double.IsNaN(score))
+            {
+                return false;
+            }
+
+            switch (_optimizingMetric)
+            {
+                case RegressionMetric.MeanAbsoluteError:
+                    return score == 0;
+                case RegressionMetric.MeanSquaredError:
+                    return score == 0;
+                case RegressionMetric.RootMeanSquaredError:
+                    return score == 0;
+                case RegressionMetric.RSquared:
+                    return score == 1;
+                default:
+                    throw MetricsAgentUtil.BuildMetricNotSupportedException(_optimizingMetric);
+            }
+        }
+
+        /// <summary>Evaluates scored data through the context's regression evaluator.</summary>
+        public RegressionMetrics EvaluateMetrics(IDataView data, string labelColumn)
+        {
+            return _mlContext.Regression.Evaluate(data, labelColumn);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/ModelContainer.cs b/src/Microsoft.ML.AutoML/Experiment/ModelContainer.cs
new file mode 100644
index 0000000000..8b829da7ab
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/ModelContainer.cs
@@ -0,0 +1,50 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Holds a trained model either directly in memory or as a file on disk, and hands it
+    /// back through <see cref="GetModel"/> regardless of where it lives.
+    /// </summary>
+    internal class ModelContainer
+    {
+        private readonly MLContext _mlContext;
+        private readonly FileInfo _fileInfo;
+        private readonly ITransformer _model;
+
+        /// <summary>Creates a container that keeps the model in memory.</summary>
+        internal ModelContainer(MLContext mlContext, ITransformer model)
+        {
+            _model = model;
+            _mlContext = mlContext;
+        }
+
+        /// <summary>
+        /// Creates a container backed by a file: the model is saved to <paramref name="fileInfo"/>
+        /// immediately and no in-memory reference to it is kept.
+        /// </summary>
+        internal ModelContainer(MLContext mlContext, FileInfo fileInfo, ITransformer model, DataViewSchema modelInputSchema)
+        {
+            _fileInfo = fileInfo;
+            _mlContext = mlContext;
+
+            // Persist the model right away; later reads go through the file.
+            using (var destination = File.Create(fileInfo.FullName))
+            {
+                _mlContext.Model.Save(model, modelInputSchema, destination);
+            }
+        }
+
+        /// <summary>
+        /// Returns the in-memory model when there is one; otherwise loads it from the backing file.
+        /// </summary>
+        public ITransformer GetModel()
+        {
+            return _model ?? LoadFromDisk();
+        }
+
+        // Opens the backing file for shared reading and deserializes the model; the input
+        // schema that Load also produces is not needed here.
+        private ITransformer LoadFromDisk()
+        {
+            using (var source = new FileStream(_fileInfo.FullName, FileMode.Open, FileAccess.Read, FileShare.Read))
+            {
+                return _mlContext.Model.Load(source, out _);
+            }
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/OptimizingMetricInfo.cs b/src/Microsoft.ML.AutoML/Experiment/OptimizingMetricInfo.cs
new file mode 100644
index 0000000000..c6ce0ec943
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/OptimizingMetricInfo.cs
@@ -0,0 +1,44 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Linq;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Records whether the metric being optimized should be maximized or minimized.
+    /// A metric is treated as maximizing unless it appears in one of the minimizing lists below.
+    /// </summary>
+    internal sealed class OptimizingMetricInfo
+    {
+        // True when a larger value of the metric is better.
+        public bool IsMaximizing { get; }
+
+        // Regression metrics for which a smaller value is better.
+        private static RegressionMetric[] _minimizingRegressionMetrics = new RegressionMetric[]
+        {
+            RegressionMetric.MeanAbsoluteError,
+            RegressionMetric.MeanSquaredError,
+            RegressionMetric.RootMeanSquaredError
+        };
+
+        // Empty: every binary classification metric is treated as maximizing.
+        private static BinaryClassificationMetric[] _minimizingBinaryMetrics = new BinaryClassificationMetric[]
+        {
+        };
+
+        // Multiclass metrics for which a smaller value is better.
+        private static MulticlassClassificationMetric[] _minimizingMulticlassMetrics = new MulticlassClassificationMetric[]
+        {
+            MulticlassClassificationMetric.LogLoss,
+        };
+
+        /// <summary>Sets IsMaximizing for a regression metric.</summary>
+        public OptimizingMetricInfo(RegressionMetric regressionMetric)
+        {
+            IsMaximizing = !_minimizingRegressionMetrics.Contains(regressionMetric);
+        }
+
+        /// <summary>Sets IsMaximizing for a binary classification metric.</summary>
+        public OptimizingMetricInfo(BinaryClassificationMetric binaryMetric)
+        {
+            IsMaximizing = !_minimizingBinaryMetrics.Contains(binaryMetric);
+        }
+
+        /// <summary>Sets IsMaximizing for a multiclass classification metric.</summary>
+        public OptimizingMetricInfo(MulticlassClassificationMetric multiMetric)
+        {
+            IsMaximizing = !_minimizingMulticlassMetrics.Contains(multiMetric);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/RecipeInference.cs b/src/Microsoft.ML.AutoML/Experiment/RecipeInference.cs
new file mode 100644
index 0000000000..f2857efc28
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/RecipeInference.cs
@@ -0,0 +1,29 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+
+namespace Microsoft.ML.AutoML
+{
+    internal static class RecipeInference
+    {
+        /// <summary>
+        /// Given a predictor type, return a set of all permissible trainers (with their sweeper params, if defined).
+        /// </summary>
+        /// <returns>Array of viable learners.</returns>
+        // NOTE(review): the element types of IEnumerable / List are missing in this text -
+        // presumably lost in extraction; confirm against the original file.
+        public static IEnumerable AllowedTrainers(MLContext mlContext, TaskKind task,
+            ColumnInformation columnInfo, IEnumerable trainerWhitelist)
+        {
+            var trainerExtensions = TrainerExtensionCatalog.GetTrainers(task, trainerWhitelist);
+
+            // Wrap each catalog extension in a SuggestedTrainer bound to this context and column info.
+            var trainers = new List();
+            foreach (var trainerExtension in trainerExtensions)
+            {
+                var learner = new SuggestedTrainer(mlContext, trainerExtension, columnInfo);
+                trainers.Add(learner);
+            }
+            return trainers.ToArray();
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValRunner.cs b/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValRunner.cs
new file mode 100644
index 0000000000..b5c8f10476
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValRunner.cs
@@ -0,0 +1,75 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Microsoft.ML.Runtime;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Runs a suggested pipeline once per train/validation dataset pair (fold) and reports the
+    /// per-fold results together with their average score.
+    /// </summary>
+    // NOTE(review): type-argument lists appear to be missing throughout this class
+    // (e.g. "IRunner>" below) - presumably lost in extraction; confirm against the original file.
+    internal class CrossValRunner : IRunner>
+        where TMetrics : class
+    {
+        private readonly MLContext _context;
+        private readonly IDataView[] _trainDatasets;
+        private readonly IDataView[] _validDatasets;
+        private readonly IMetricsAgent _metricsAgent;
+        private readonly IEstimator _preFeaturizer;
+        private readonly ITransformer[] _preprocessorTransforms;
+        private readonly string _labelColumn;
+        private readonly IChannel _logger;
+        private readonly DataViewSchema _modelInputSchema;
+
+        /// <summary>
+        /// Stores the fold datasets and collaborators. The model input schema is taken from the
+        /// first training dataset, so <paramref name="trainDatasets"/> must be non-empty.
+        /// </summary>
+        public CrossValRunner(MLContext context,
+            IDataView[] trainDatasets,
+            IDataView[] validDatasets,
+            IMetricsAgent metricsAgent,
+            IEstimator preFeaturizer,
+            ITransformer[] preprocessorTransforms,
+            string labelColumn,
+            IChannel logger)
+        {
+            _context = context;
+            _trainDatasets = trainDatasets;
+            _validDatasets = validDatasets;
+            _metricsAgent = metricsAgent;
+            _preFeaturizer = preFeaturizer;
+            _preprocessorTransforms = preprocessorTransforms;
+            _labelColumn = labelColumn;
+            _logger = logger;
+            _modelInputSchema = trainDatasets[0].Schema;
+        }
+
+        /// <summary>
+        /// Trains and scores <paramref name="pipeline"/> on every fold, then builds the run detail
+        /// from the collected fold results. The run counts as succeeded only when no fold
+        /// recorded an exception.
+        /// </summary>
+        public (SuggestedPipelineRunDetail suggestedPipelineRunDetail, CrossValidationRunDetail runDetail)
+            Run(SuggestedPipeline pipeline, DirectoryInfo modelDirectory, int iterationNum)
+        {
+            var trainResults = new List>();
+
+            for (var i = 0; i < _trainDatasets.Length; i++)
+            {
+                // Fold numbers in model file names are 1-based.
+                var modelFileInfo = RunnerUtil.GetModelFileInfo(modelDirectory, iterationNum, i + 1);
+                var trainResult = RunnerUtil.TrainAndScorePipeline(_context, pipeline, _trainDatasets[i], _validDatasets[i],
+                    _labelColumn, _metricsAgent, _preprocessorTransforms?[i], modelFileInfo, _modelInputSchema, _logger);
+                trainResults.Add(new SuggestedPipelineTrainResult(trainResult.model, trainResult.metrics, trainResult.exception, trainResult.score));
+            }
+
+            var avgScore = CalcAverageScore(trainResults.Select(r => r.Score));
+            var allRunsSucceeded = trainResults.All(r => r.Exception == null);
+
+            var suggestedPipelineRunDetail = new SuggestedPipelineCrossValRunDetail(pipeline, avgScore, allRunsSucceeded, trainResults);
+            var runDetail = suggestedPipelineRunDetail.ToIterationResult(_preFeaturizer);
+            return (suggestedPipelineRunDetail, runDetail);
+        }
+
+        // Mean of the fold scores; NaN as soon as any single fold score is NaN.
+        private static double CalcAverageScore(IEnumerable scores)
+        {
+            if (scores.Any(s => double.IsNaN(s)))
+            {
+                return double.NaN;
+            }
+            return scores.Average();
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs b/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs
new file mode 100644
index 0000000000..e640174fd8
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/Runners/CrossValSummaryRunner.cs
@@ -0,0 +1,102 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Microsoft.ML.Runtime;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Runs a suggested pipeline once per train/validation dataset pair (fold) and condenses the
+    /// folds into a single run detail: the average score, the model of the best-scoring fold,
+    /// and the metrics of the fold whose score is closest to the average.
+    /// </summary>
+    // NOTE(review): type-argument lists appear to be missing throughout this class
+    // (e.g. "IRunner>" below) - presumably lost in extraction; confirm against the original file.
+    internal class CrossValSummaryRunner : IRunner>
+        where TMetrics : class
+    {
+        private readonly MLContext _context;
+        private readonly IDataView[] _trainDatasets;
+        private readonly IDataView[] _validDatasets;
+        private readonly IMetricsAgent _metricsAgent;
+        private readonly IEstimator _preFeaturizer;
+        private readonly ITransformer[] _preprocessorTransforms;
+        private readonly string _labelColumn;
+        private readonly OptimizingMetricInfo _optimizingMetricInfo;
+        private readonly IChannel _logger;
+        private readonly DataViewSchema _modelInputSchema;
+
+        /// <summary>
+        /// Stores the fold datasets and collaborators. The model input schema is taken from the
+        /// first training dataset, so <paramref name="trainDatasets"/> must be non-empty.
+        /// </summary>
+        public CrossValSummaryRunner(MLContext context,
+            IDataView[] trainDatasets,
+            IDataView[] validDatasets,
+            IMetricsAgent metricsAgent,
+            IEstimator preFeaturizer,
+            ITransformer[] preprocessorTransforms,
+            string labelColumn,
+            OptimizingMetricInfo optimizingMetricInfo,
+            IChannel logger)
+        {
+            _context = context;
+            _trainDatasets = trainDatasets;
+            _validDatasets = validDatasets;
+            _metricsAgent = metricsAgent;
+            _preFeaturizer = preFeaturizer;
+            _preprocessorTransforms = preprocessorTransforms;
+            _labelColumn = labelColumn;
+            _optimizingMetricInfo = optimizingMetricInfo;
+            _logger = logger;
+            _modelInputSchema = trainDatasets[0].Schema;
+        }
+
+        /// <summary>
+        /// Trains and scores <paramref name="pipeline"/> on every fold. If any fold recorded an
+        /// exception, returns a failed run detail carrying a NaN score and the first such
+        /// exception; otherwise returns the summarized result described on the class.
+        /// </summary>
+        public (SuggestedPipelineRunDetail suggestedPipelineRunDetail, RunDetail runDetail)
+            Run(SuggestedPipeline pipeline, DirectoryInfo modelDirectory, int iterationNum)
+        {
+            var trainResults = new List<(ModelContainer model, TMetrics metrics, Exception exception, double score)>();
+
+            for (var i = 0; i < _trainDatasets.Length; i++)
+            {
+                // Fold numbers in model file names are 1-based.
+                var modelFileInfo = RunnerUtil.GetModelFileInfo(modelDirectory, iterationNum, i + 1);
+                var trainResult = RunnerUtil.TrainAndScorePipeline(_context, pipeline, _trainDatasets[i], _validDatasets[i],
+                    _labelColumn, _metricsAgent, _preprocessorTransforms?.ElementAt(i), modelFileInfo, _modelInputSchema,
+                    _logger);
+                trainResults.Add(trainResult);
+            }
+
+            var allRunsSucceeded = trainResults.All(r => r.exception == null);
+            if (!allRunsSucceeded)
+            {
+                var firstException = trainResults.First(r => r.exception != null).exception;
+                var errorRunDetail = new SuggestedPipelineRunDetail(pipeline, double.NaN, false, null, null, firstException);
+                return (errorRunDetail, errorRunDetail.ToIterationResult(_preFeaturizer));
+            }
+
+            // Get the model from the best fold
+            var bestFoldIndex = BestResultUtil.GetIndexOfBestScore(trainResults.Select(r => r.score), _optimizingMetricInfo.IsMaximizing);
+            var bestModel = trainResults.ElementAt(bestFoldIndex).model;
+
+            // Get the metrics from the fold whose score is closest to avg of all fold scores
+            var avgScore = trainResults.Average(r => r.score);
+            var indexClosestToAvg = GetIndexClosestToAverage(trainResults.Select(r => r.score), avgScore);
+            var metricsClosestToAvg = trainResults[indexClosestToAvg].metrics;
+
+            // Build result objects
+            var suggestedPipelineRunDetail = new SuggestedPipelineRunDetail(pipeline, avgScore, allRunsSucceeded, metricsClosestToAvg, bestModel, null);
+            var runDetail = suggestedPipelineRunDetail.ToIterationResult(_preFeaturizer);
+            return (suggestedPipelineRunDetail, runDetail);
+        }
+
+        // Returns the index of the value with the smallest absolute distance from the average
+        // (the first such index on ties), or -1 when the sequence is empty.
+        // NOTE(review): values.Count() and values.ElementAt(i) are evaluated on every iteration,
+        // and the caller passes a Select(...) projection, so each call re-walks the sequence;
+        // consider materializing the values once.
+        private static int GetIndexClosestToAverage(IEnumerable values, double average)
+        {
+            int avgFoldIndex = -1;
+            var smallestDistFromAvg = double.PositiveInfinity;
+            for (var i = 0; i < values.Count(); i++)
+            {
+                var distFromAvg = Math.Abs(values.ElementAt(i) - average);
+                // The second condition accepts a candidate while no finite distance has been
+                // recorded yet, even when the strict comparison is false.
+                if (distFromAvg < smallestDistFromAvg || smallestDistFromAvg == double.PositiveInfinity)
+                {
+                    smallestDistFromAvg = distFromAvg;
+                    avgFoldIndex = i;
+                }
+            }
+            return avgFoldIndex;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/Experiment/Runners/IRunner.cs b/src/Microsoft.ML.AutoML/Experiment/Runners/IRunner.cs
new file mode 100644
index 0000000000..8d417c1f45
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/Runners/IRunner.cs
@@ -0,0 +1,14 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// A strategy for running one suggested pipeline and producing both the internal run record
+    /// and the run detail handed back to the caller.
+    /// </summary>
+    // NOTE(review): TRunDetail is constrained below but not declared on the interface as shown -
+    // presumably the type-parameter list was lost from this text; confirm against the original file.
+    internal interface IRunner where TRunDetail : RunDetail
+    {
+        // modelDirectory may be null (the runners then keep models in memory);
+        // iterationNum is used when naming model files.
+        (SuggestedPipelineRunDetail suggestedPipelineRunDetail, TRunDetail runDetail)
+            Run (SuggestedPipeline pipeline, DirectoryInfo modelDirectory, int iterationNum);
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/Experiment/Runners/RunnerUtil.cs b/src/Microsoft.ML.AutoML/Experiment/Runners/RunnerUtil.cs
new file mode 100644
index 0000000000..6fca4d78c0
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/Runners/RunnerUtil.cs
@@ -0,0 +1,60 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.IO;
+using Microsoft.ML.Runtime;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>Training and file-naming helpers shared by the runners.</summary>
+    internal static class RunnerUtil
+    {
+        /// <summary>
+        /// Fits <paramref name="pipeline"/> on the training data, scores the validation data, and
+        /// returns the model container, metrics and score with a null exception. Any exception
+        /// raised along the way is logged and returned instead, with a null model, null metrics
+        /// and a NaN score; nothing is rethrown.
+        /// </summary>
+        // NOTE(review): TMetrics is constrained below but not declared on the method as shown, and
+        // IMetricsAgent has no type argument - presumably lost in extraction; confirm against the
+        // original file.
+        public static (ModelContainer model, TMetrics metrics, Exception exception, double score)
+            TrainAndScorePipeline(MLContext context,
+            SuggestedPipeline pipeline,
+            IDataView trainData,
+            IDataView validData,
+            string labelColumn,
+            IMetricsAgent metricsAgent,
+            ITransformer preprocessorTransform,
+            FileInfo modelFileInfo,
+            DataViewSchema modelInputSchema,
+            IChannel logger) where TMetrics : class
+        {
+            try
+            {
+                var estimator = pipeline.ToEstimator();
+                var model = estimator.Fit(trainData);
+
+                var scoredData = model.Transform(validData);
+                var metrics = metricsAgent.EvaluateMetrics(scoredData, labelColumn);
+                var score = metricsAgent.GetScore(metrics);
+
+                // The preprocessor is prepended only after scoring, so the evaluation above
+                // used the fitted pipeline on its own.
+                if (preprocessorTransform != null)
+                {
+                    model = preprocessorTransform.Append(model);
+                }
+
+                // Build container for model
+                // (in memory when no file was requested, otherwise saved to modelFileInfo)
+                var modelContainer = modelFileInfo == null ?
+                    new ModelContainer(context, model) :
+                    new ModelContainer(context, modelFileInfo, model, modelInputSchema);
+
+                return (modelContainer, metrics, null, score);
+            }
+            catch (Exception ex)
+            {
+                logger.Error($"Pipeline crashed: {pipeline.ToString()} . Exception: {ex}");
+                return (null, null, ex, double.NaN);
+            }
+        }
+
+        /// <summary>
+        /// Returns the file "Model{iterationNum}_{foldNum}.zip" inside
+        /// <paramref name="modelDirectory"/>, or null when no directory is given.
+        /// </summary>
+        public static FileInfo GetModelFileInfo(DirectoryInfo modelDirectory, int iterationNum, int foldNum)
+        {
+            return modelDirectory == null ?
+                null :
+                new FileInfo(Path.Combine(modelDirectory.FullName, $"Model{iterationNum}_{foldNum}.zip"));
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/Experiment/Runners/TrainValidateRunner.cs b/src/Microsoft.ML.AutoML/Experiment/Runners/TrainValidateRunner.cs
new file mode 100644
index 0000000000..d608f7dd2f
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/Runners/TrainValidateRunner.cs
@@ -0,0 +1,66 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.IO;
+using Microsoft.ML.Runtime;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Runs a suggested pipeline against a single training / validation dataset pair.
+    /// </summary>
+    // NOTE(review): type-argument lists appear to be missing throughout this class
+    // (e.g. "IRunner>" below) - presumably lost in extraction; confirm against the original file.
+    internal class TrainValidateRunner : IRunner>
+        where TMetrics : class
+    {
+        private readonly MLContext _context;
+        private readonly IDataView _trainData;
+        private readonly IDataView _validData;
+        private readonly string _labelColumn;
+        private readonly IMetricsAgent _metricsAgent;
+        private readonly IEstimator _preFeaturizer;
+        private readonly ITransformer _preprocessorTransform;
+        private readonly IChannel _logger;
+        private readonly DataViewSchema _modelInputSchema;
+
+        /// <summary>
+        /// Stores the datasets and collaborators; the model input schema is taken from the
+        /// training data.
+        /// </summary>
+        public TrainValidateRunner(MLContext context,
+            IDataView trainData,
+            IDataView validData,
+            string labelColumn,
+            IMetricsAgent metricsAgent,
+            IEstimator preFeaturizer,
+            ITransformer preprocessorTransform,
+            IChannel logger)
+        {
+            _context = context;
+            _trainData = trainData;
+            _validData = validData;
+            _labelColumn = labelColumn;
+            _metricsAgent = metricsAgent;
+            _preFeaturizer = preFeaturizer;
+            _preprocessorTransform = preprocessorTransform;
+            _logger = logger;
+            _modelInputSchema = trainData.Schema;
+        }
+
+        /// <summary>
+        /// Trains and scores <paramref name="pipeline"/> once and wraps the outcome in a run
+        /// detail; the run counts as succeeded when training recorded no exception.
+        /// </summary>
+        public (SuggestedPipelineRunDetail suggestedPipelineRunDetail, RunDetail runDetail)
+            Run(SuggestedPipeline pipeline, DirectoryInfo modelDirectory, int iterationNum)
+        {
+            var modelFileInfo = GetModelFileInfo(modelDirectory, iterationNum);
+            var trainResult = RunnerUtil.TrainAndScorePipeline(_context, pipeline, _trainData, _validData,
+                _labelColumn, _metricsAgent, _preprocessorTransform, modelFileInfo, _modelInputSchema, _logger);
+            var suggestedPipelineRunDetail = new SuggestedPipelineRunDetail(pipeline,
+                trainResult.score,
+                trainResult.exception == null,
+                trainResult.metrics,
+                trainResult.model,
+                trainResult.exception);
+            var runDetail = suggestedPipelineRunDetail.ToIterationResult(_preFeaturizer);
+            return (suggestedPipelineRunDetail, runDetail);
+        }
+
+        // Returns the file "Model{iterationNum}.zip" inside modelDirectory, or null when no
+        // directory is given.
+        private static FileInfo GetModelFileInfo(DirectoryInfo modelDirectory, int iterationNum)
+        {
+            return modelDirectory == null ?
+                null :
+                new FileInfo(Path.Combine(modelDirectory.FullName, $"Model{iterationNum}.zip"));
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/Experiment/SuggestedPipeline.cs b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipeline.cs
new file mode 100644
index 0000000000..72c314fe93
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipeline.cs
@@ -0,0 +1,144 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// A runnable pipeline. Contains a learner and set of transforms,
+    /// along with a RunSummary if it has already been executed.
+    /// </summary>
+    // NOTE(review): element types of IList / IEnumerable / List / IEstimator / EstimatorChain are
+    // missing in this text - presumably lost in extraction; confirm against the original file.
+    internal class SuggestedPipeline
+    {
+        public readonly IList Transforms;
+        public readonly SuggestedTrainer Trainer;
+        public readonly IList TransformsPostTrainer;
+
+        private readonly MLContext _context;
+        private readonly bool _cacheBeforeTrainer;
+
+        /// <summary>
+        /// Builds a pipeline from clones of the given transforms and trainer, so the pipeline
+        /// does not share those objects with the caller.
+        /// </summary>
+        public SuggestedPipeline(IEnumerable transforms,
+            IEnumerable transformsPostTrainer,
+            SuggestedTrainer trainer,
+            MLContext context,
+            bool cacheBeforeTrainer)
+        {
+            Transforms = transforms.Select(t => t.Clone()).ToList();
+            TransformsPostTrainer = transformsPostTrainer.Select(t => t.Clone()).ToList();
+            Trainer = trainer.Clone();
+            _context = context;
+            _cacheBeforeTrainer = cacheBeforeTrainer;
+        }
+
+        // Textual form of the pipeline; Equals and GetHashCode below are both defined on it.
+        public override string ToString() => $"{string.Join(" ", Transforms.Select(t => $"xf={t}"))} tr={Trainer} {string.Join(" ", TransformsPostTrainer.Select(t => $"xf={t}"))} cache={(_cacheBeforeTrainer ? "+" : "-")}";
+
+        /// <summary>Two pipelines are equal when their ToString() output is identical.</summary>
+        public override bool Equals(object obj)
+        {
+            var pipeline = obj as SuggestedPipeline;
+            if(pipeline == null)
+            {
+                return false;
+            }
+            return pipeline.ToString() == ToString();
+        }
+
+        /// <summary>Hash of the ToString() output, consistent with Equals.</summary>
+        public override int GetHashCode()
+        {
+            return ToString().GetHashCode();
+        }
+
+        /// <summary>
+        /// Converts to a Pipeline whose nodes are the pre-trainer transforms, the trainer, and the
+        /// post-trainer transforms, in that order.
+        /// </summary>
+        public Pipeline ToPipeline()
+        {
+            var pipelineElements = new List();
+            foreach(var transform in Transforms)
+            {
+                pipelineElements.Add(transform.PipelineNode);
+            }
+            pipelineElements.Add(Trainer.ToPipelineNode());
+            foreach (var transform in TransformsPostTrainer)
+            {
+                pipelineElements.Add(transform.PipelineNode);
+            }
+            return new Pipeline(pipelineElements.ToArray(), _cacheBeforeTrainer);
+        }
+
+        /// <summary>
+        /// Rebuilds a SuggestedPipeline from a Pipeline. Transform nodes seen before the trainer
+        /// node become pre-trainer transforms; those seen after it become post-trainer transforms.
+        /// </summary>
+        public static SuggestedPipeline FromPipeline(MLContext context, Pipeline pipeline)
+        {
+            var transforms = new List();
+            var transformsPostTrainer = new List();
+            SuggestedTrainer trainer = null;
+
+            var trainerEncountered = false;
+            foreach(var pipelineNode in pipeline.Nodes)
+            {
+                if(pipelineNode.NodeType == PipelineNodeType.Trainer)
+                {
+                    // Node names are parsed back into the TrainerName / EstimatorName enums.
+                    var trainerName = (TrainerName)Enum.Parse(typeof(TrainerName), pipelineNode.Name);
+                    var trainerExtension = TrainerExtensionCatalog.GetTrainerExtension(trainerName);
+                    var hyperParamSet = TrainerExtensionUtil.BuildParameterSet(trainerName, pipelineNode.Properties);
+                    var columnInfo = TrainerExtensionUtil.BuildColumnInfo(pipelineNode.Properties);
+                    trainer = new SuggestedTrainer(context, trainerExtension, columnInfo, hyperParamSet);
+                    trainerEncountered = true;
+                }
+                else if (pipelineNode.NodeType == PipelineNodeType.Transform)
+                {
+                    var estimatorName = (EstimatorName)Enum.Parse(typeof(EstimatorName), pipelineNode.Name);
+                    var estimatorExtension = EstimatorExtensionCatalog.GetExtension(estimatorName);
+                    var estimator = estimatorExtension.CreateInstance(context, pipelineNode);
+                    var transform = new SuggestedTransform(pipelineNode, estimator);
+                    if (!trainerEncountered)
+                    {
+                        transforms.Add(transform);
+                    }
+                    else
+                    {
+                        transformsPostTrainer.Add(transform);
+                    }
+                }
+            }
+
+            // NOTE(review): trainer is still null here if the pipeline has no trainer node, and the
+            // constructor calls trainer.Clone() - presumably callers always supply one; verify.
+            return new SuggestedPipeline(transforms, transformsPostTrainer, trainer, context, pipeline.CacheBeforeTrainer);
+        }
+
+        /// <summary>
+        /// Builds the estimator chain: pre-trainer transforms, an optional cache checkpoint, the
+        /// trainer, then post-trainer transforms. Transforms whose Estimator is null are skipped.
+        /// </summary>
+        public IEstimator ToEstimator()
+        {
+            IEstimator pipeline = new EstimatorChain();
+
+            // Append each transformer to the pipeline
+            foreach (var transform in Transforms)
+            {
+                if (transform.Estimator != null)
+                {
+                    pipeline = pipeline.Append(transform.Estimator);
+                }
+            }
+
+            // Get learner
+            var learner = Trainer.BuildTrainer();
+
+            // The checkpoint goes directly in front of the learner.
+            if (_cacheBeforeTrainer)
+            {
+                pipeline = pipeline.AppendCacheCheckpoint(_context);
+            }
+
+            // Append learner to pipeline
+            pipeline = pipeline.Append(learner);
+
+            // Append each post-trainer transformer to the pipeline
+            foreach (var transform in TransformsPostTrainer)
+            {
+                if (transform.Estimator != null)
+                {
+                    pipeline = pipeline.Append(transform.Estimator);
+                }
+            }
+
+            return pipeline;
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineBuilder.cs b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineBuilder.cs
new file mode 100644
index 0000000000..ddbf49a364
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineBuilder.cs
@@ -0,0 +1,43 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Assembles a SuggestedPipeline, adding a normalization transform and deciding on caching
+    /// according to what the trainer reports about itself.
+    /// </summary>
+    // NOTE(review): the element type of ICollection is missing in this text - presumably lost in
+    // extraction; confirm against the original file.
+    internal static class SuggestedPipelineBuilder
+    {
+        /// <summary>
+        /// Builds the pipeline. The trainer is instantiated once here only to read its Info.
+        /// Note that <paramref name="transforms"/> is modified in place when a normalization
+        /// transform is added.
+        /// </summary>
+        public static SuggestedPipeline Build(MLContext context,
+            ICollection transforms,
+            ICollection transformsPostTrainer,
+            SuggestedTrainer trainer,
+            CacheBeforeTrainer cacheBeforeTrainerSettings)
+        {
+            var trainerInfo = trainer.BuildTrainer().Info;
+            AddNormalizationTransforms(context, trainerInfo, transforms);
+            var cacheBeforeTrainer = ShouldCacheBeforeTrainer(trainerInfo, cacheBeforeTrainerSettings);
+            return new SuggestedPipeline(transforms, transformsPostTrainer, trainer, context, cacheBeforeTrainer);
+        }
+
+        // Appends a normalizing transform that reads and writes the default features column,
+        // but only for trainers that report NeedNormalization.
+        private static void AddNormalizationTransforms(MLContext context,
+            TrainerInfo trainerInfo,
+            ICollection transforms)
+        {
+            // Only add normalization if trainer needs it
+            if (!trainerInfo.NeedNormalization)
+            {
+                return;
+            }
+
+            var transform = NormalizingExtension.CreateSuggestedTransform(context, DefaultColumnNames.Features, DefaultColumnNames.Features);
+            transforms.Add(transform);
+        }
+
+        // Caches when the setting is On, or when it is Auto and the trainer reports WantCaching.
+        private static bool ShouldCacheBeforeTrainer(TrainerInfo trainerInfo, CacheBeforeTrainer cacheBeforeTrainerSettings)
+        {
+            return cacheBeforeTrainerSettings == CacheBeforeTrainer.On || (cacheBeforeTrainerSettings == CacheBeforeTrainer.Auto && trainerInfo.WantCaching);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineCrossValRunDetail.cs b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineCrossValRunDetail.cs
new file mode 100644
index 0000000000..b16a9d0d43
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineCrossValRunDetail.cs
@@ -0,0 +1,55 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Microsoft.ML.AutoML +{ + internal sealed class SuggestedPipelineTrainResult + { + public readonly TMetrics ValidationMetrics; + public readonly ModelContainer ModelContainer; + public readonly Exception Exception; + public readonly double Score; + + internal SuggestedPipelineTrainResult(ModelContainer modelContainer, + TMetrics metrics, + Exception exception, + double score) + { + ModelContainer = modelContainer; + ValidationMetrics = metrics; + Exception = exception; + Score = score; + } + + public TrainResult ToTrainResult() + { + return new TrainResult(ModelContainer, ValidationMetrics, Exception); + } + } + + internal sealed class SuggestedPipelineCrossValRunDetail : SuggestedPipelineRunDetail + { + public readonly IEnumerable> Results; + + internal SuggestedPipelineCrossValRunDetail(SuggestedPipeline pipeline, + double score, + bool runSucceeded, + IEnumerable> results) : base(pipeline, score, runSucceeded) + { + Results = results; + Exception = Results.Select(r => r.Exception).FirstOrDefault(e => e != null); + } + + public CrossValidationRunDetail ToIterationResult(IEstimator preFeaturizer) + { + var estimator = SuggestedPipelineRunDetailUtil.PrependPreFeaturizer(Pipeline.ToEstimator(), preFeaturizer); + return new CrossValidationRunDetail(Pipeline.Trainer.TrainerName.ToString(), estimator, + Pipeline.ToPipeline(), Results.Select(r => r.ToTrainResult())); + } + } +} diff --git a/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineRunDetail.cs b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineRunDetail.cs new file mode 100644 index 0000000000..460369518d --- /dev/null +++ b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineRunDetail.cs @@ -0,0 +1,59 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using System; + +namespace Microsoft.ML.AutoML +{ + internal class SuggestedPipelineRunDetail + { + public readonly SuggestedPipeline Pipeline; + public readonly bool RunSucceded; + public readonly double Score; + + public Exception Exception { get; set; } + + public SuggestedPipelineRunDetail(SuggestedPipeline pipeline, double score, bool runSucceeded) + { + Pipeline = pipeline; + Score = score; + RunSucceded = runSucceeded; + } + + public static SuggestedPipelineRunDetail FromPipelineRunResult(MLContext context, PipelineScore pipelineRunResult) + { + return new SuggestedPipelineRunDetail(SuggestedPipeline.FromPipeline(context, pipelineRunResult.Pipeline), pipelineRunResult.Score, pipelineRunResult.RunSucceded); + } + + public IRunResult ToRunResult(bool isMetricMaximizing) + { + return new RunResult(Pipeline.Trainer.HyperParamSet, Score, isMetricMaximizing); + } + } + + internal class SuggestedPipelineRunDetail : SuggestedPipelineRunDetail + { + public readonly TMetrics ValidationMetrics; + public readonly ModelContainer ModelContainer; + + internal SuggestedPipelineRunDetail(SuggestedPipeline pipeline, + double score, + bool runSucceeded, + TMetrics validationMetrics, + ModelContainer modelContainer, + Exception ex) : base(pipeline, score, runSucceeded) + { + ValidationMetrics = validationMetrics; + ModelContainer = modelContainer; + Exception = ex; + } + + public RunDetail ToIterationResult(IEstimator preFeaturizer) + { + var estimator = SuggestedPipelineRunDetailUtil.PrependPreFeaturizer(Pipeline.ToEstimator(), preFeaturizer); + return new RunDetail(Pipeline.Trainer.TrainerName.ToString(), estimator, + Pipeline.ToPipeline(), ModelContainer, ValidationMetrics, Exception); + } + } +} diff --git a/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineRunDetailUtil.cs b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineRunDetailUtil.cs new 
file mode 100644 index 0000000000..61230d4862 --- /dev/null +++ b/src/Microsoft.ML.AutoML/Experiment/SuggestedPipelineRunDetails/SuggestedPipelineRunDetailUtil.cs @@ -0,0 +1,18 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.ML.AutoML +{ + internal static class SuggestedPipelineRunDetailUtil + { + public static IEstimator PrependPreFeaturizer(IEstimator estimator, IEstimator preFeaturizer) + { + if (preFeaturizer == null) + { + return estimator; + } + return preFeaturizer.Append(estimator); + } + } +} diff --git a/src/Microsoft.ML.AutoML/Experiment/SuggestedTrainer.cs b/src/Microsoft.ML.AutoML/Experiment/SuggestedTrainer.cs new file mode 100644 index 0000000000..155d7ba101 --- /dev/null +++ b/src/Microsoft.ML.AutoML/Experiment/SuggestedTrainer.cs @@ -0,0 +1,92 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Trainers; + +namespace Microsoft.ML.AutoML +{ + internal class SuggestedTrainer + { + public IEnumerable SweepParams { get; } + public TrainerName TrainerName { get; } + public ParameterSet HyperParamSet { get; set; } + + private readonly MLContext _mlContext; + private readonly ITrainerExtension _trainerExtension; + private readonly ColumnInformation _columnInfo; + + internal SuggestedTrainer(MLContext mlContext, ITrainerExtension trainerExtension, + ColumnInformation columnInfo, + ParameterSet hyperParamSet = null) + { + _mlContext = mlContext; + _trainerExtension = trainerExtension; + _columnInfo = columnInfo; + SweepParams = _trainerExtension.GetHyperparamSweepRanges(); + TrainerName = TrainerExtensionCatalog.GetTrainerName(_trainerExtension); + SetHyperparamValues(hyperParamSet); + } + + public void SetHyperparamValues(ParameterSet hyperParamSet) + { + HyperParamSet = hyperParamSet; + PropagateParamSetValues(); + } + + public SuggestedTrainer Clone() + { + return new SuggestedTrainer(_mlContext, _trainerExtension, _columnInfo, HyperParamSet?.Clone()); + } + + public ITrainerEstimator, object> BuildTrainer() + { + IEnumerable sweepParams = null; + if (HyperParamSet != null) + { + sweepParams = SweepParams; + } + return _trainerExtension.CreateInstance(_mlContext, sweepParams, _columnInfo); + } + + public override string ToString() + { + var paramsStr = string.Empty; + if (SweepParams != null) + { + paramsStr = string.Join(", ", SweepParams.Where(p => p != null && p.RawValue != null).Select(p => $"{p.Name}:{p.ProcessedValue()}")); + } + return $"{TrainerName}{{{paramsStr}}}"; + } + + public PipelineNode ToPipelineNode() + { + var sweepParams = SweepParams.Where(p => p.RawValue != null); + return _trainerExtension.CreatePipelineNode(sweepParams, _columnInfo); + } + + /// + /// make sure sweep params and param set are consistent + /// + private void PropagateParamSetValues() + { + if 
(HyperParamSet == null) + { + return; + } + + var spMap = SweepParams.ToDictionary(sp => sp.Name); + + foreach (var hp in HyperParamSet) + { + if (spMap.ContainsKey(hp.Name)) + { + var sp = spMap[hp.Name]; + sp.SetUsingValueText(hp.ValueText); + } + } + } + } +} diff --git a/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj new file mode 100644 index 0000000000..ffbcb43ea2 --- /dev/null +++ b/src/Microsoft.ML.AutoML/Microsoft.ML.AutoML.csproj @@ -0,0 +1,16 @@ + + + netstandard2.0 + Microsoft.ML.AutoML + + + + + + + + + + + + diff --git a/src/Microsoft.ML.AutoML/PipelineSuggesters/PipelineSuggester.cs b/src/Microsoft.ML.AutoML/PipelineSuggesters/PipelineSuggester.cs new file mode 100644 index 0000000000..dce6f53f6a --- /dev/null +++ b/src/Microsoft.ML.AutoML/PipelineSuggesters/PipelineSuggester.cs @@ -0,0 +1,217 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML +{ + internal static class PipelineSuggester + { + private const int TopKTrainers = 3; + + public static Pipeline GetNextPipeline(MLContext context, + IEnumerable history, + DatasetColumnInfo[] columns, + TaskKind task, + bool isMaximizingMetric = true) + { + var inferredHistory = history.Select(r => SuggestedPipelineRunDetail.FromPipelineRunResult(context, r)); + var nextInferredPipeline = GetNextInferredPipeline(context, inferredHistory, columns, task, isMaximizingMetric, CacheBeforeTrainer.Auto); + return nextInferredPipeline?.ToPipeline(); + } + + public static SuggestedPipeline GetNextInferredPipeline(MLContext context, + IEnumerable history, + DatasetColumnInfo[] columns, + TaskKind task, + bool isMaximizingMetric, + CacheBeforeTrainer cacheBeforeTrainer, + IEnumerable trainerWhitelist = null) + { + var availableTrainers = RecipeInference.AllowedTrainers(context, task, + ColumnInformationUtil.BuildColumnInfo(columns), trainerWhitelist); + var transforms = TransformInferenceApi.InferTransforms(context, task, columns).ToList(); + var transformsPostTrainer = TransformInferenceApi.InferTransformsPostTrainer(context, task, columns).ToList(); + + // if we haven't run all pipelines once + if (history.Count() < availableTrainers.Count()) + { + return GetNextFirstStagePipeline(context, history, availableTrainers, transforms, transformsPostTrainer, cacheBeforeTrainer); + } + + // get top trainers from stage 1 runs + var topTrainers = GetTopTrainers(history, availableTrainers, isMaximizingMetric); + + // sort top trainers by # of times they've been run, from lowest to highest + var orderedTopTrainers = OrderTrainersByNumTrials(history, topTrainers); + + // keep as hashset of previously visited pipelines + var visitedPipelines = new HashSet(history.Select(h => h.Pipeline)); + + // iterate over top trainers (from least run to most run), + // 
to find next pipeline + foreach (var trainer in orderedTopTrainers) + { + var newTrainer = trainer.Clone(); + + // repeat until passes or runs out of chances + const int maxNumberAttempts = 10; + var count = 0; + do + { + // sample new hyperparameters for the learner + if (!SampleHyperparameters(context, newTrainer, history, isMaximizingMetric)) + { + // if unable to sample new hyperparameters for the learner + // (ie SMAC returned 0 suggestions), break + break; + } + + var suggestedPipeline = SuggestedPipelineBuilder.Build(context, transforms, transformsPostTrainer, newTrainer, cacheBeforeTrainer); + + // make sure we have not seen pipeline before + if (!visitedPipelines.Contains(suggestedPipeline)) + { + return suggestedPipeline; + } + } while (++count <= maxNumberAttempts); + } + + return null; + } + + /// + /// Get top trainers from first stage + /// + private static IEnumerable GetTopTrainers(IEnumerable history, + IEnumerable availableTrainers, + bool isMaximizingMetric) + { + // narrow history to first stage runs that succeeded + history = history.Take(availableTrainers.Count()).Where(x => x.RunSucceded); + + history = history.GroupBy(r => r.Pipeline.Trainer.TrainerName).Select(g => g.First()); + IEnumerable sortedHistory = history.OrderBy(r => r.Score); + if(isMaximizingMetric) + { + sortedHistory = sortedHistory.Reverse(); + } + var topTrainers = sortedHistory.Take(TopKTrainers).Select(r => r.Pipeline.Trainer); + return topTrainers; + } + + private static IEnumerable OrderTrainersByNumTrials(IEnumerable history, + IEnumerable selectedTrainers) + { + var selectedTrainerNames = new HashSet(selectedTrainers.Select(t => t.TrainerName)); + return history.Where(h => selectedTrainerNames.Contains(h.Pipeline.Trainer.TrainerName)) + .GroupBy(h => h.Pipeline.Trainer.TrainerName) + .OrderBy(x => x.Count()) + .Select(x => x.First().Pipeline.Trainer); + } + + private static SuggestedPipeline GetNextFirstStagePipeline(MLContext context, + IEnumerable history, + 
IEnumerable availableTrainers, + ICollection transforms, + ICollection transformsPostTrainer, + CacheBeforeTrainer cacheBeforeTrainer) + { + var trainer = availableTrainers.ElementAt(history.Count()); + return SuggestedPipelineBuilder.Build(context, transforms, transformsPostTrainer, trainer, cacheBeforeTrainer); + } + + private static IValueGenerator[] ConvertToValueGenerators(IEnumerable hps) + { + var results = new IValueGenerator[hps.Count()]; + + for (int i = 0; i < hps.Count(); i++) + { + switch (hps.ElementAt(i)) + { + case SweepableDiscreteParam dp: + var dpArgs = new DiscreteParamArguments() + { + Name = dp.Name, + Values = dp.Options.Select(o => o.ToString()).ToArray() + }; + results[i] = new DiscreteValueGenerator(dpArgs); + break; + + case SweepableFloatParam fp: + var fpArgs = new FloatParamArguments() + { + Name = fp.Name, + Min = fp.Min, + Max = fp.Max, + LogBase = fp.IsLogScale, + }; + if (fp.NumSteps.HasValue) + { + fpArgs.NumSteps = fp.NumSteps.Value; + } + if (fp.StepSize.HasValue) + { + fpArgs.StepSize = fp.StepSize.Value; + } + results[i] = new FloatValueGenerator(fpArgs); + break; + + case SweepableLongParam lp: + var lpArgs = new LongParamArguments() + { + Name = lp.Name, + Min = lp.Min, + Max = lp.Max, + LogBase = lp.IsLogScale + }; + if (lp.NumSteps.HasValue) + { + lpArgs.NumSteps = lp.NumSteps.Value; + } + if (lp.StepSize.HasValue) + { + lpArgs.StepSize = lp.StepSize.Value; + } + results[i] = new LongValueGenerator(lpArgs); + break; + } + } + return results; + } + + /// + /// Samples new hyperparameters for the trainer, and sets them. + /// Returns true if success (new hyperparams were suggested and set). Else, returns false. 
+ /// + private static bool SampleHyperparameters(MLContext context, SuggestedTrainer trainer, IEnumerable history, bool isMaximizingMetric) + { + var sps = ConvertToValueGenerators(trainer.SweepParams); + var sweeper = new SmacSweeper(context, + new SmacSweeper.Arguments + { + SweptParameters = sps + }); + + IEnumerable historyToUse = history + .Where(r => r.RunSucceded && r.Pipeline.Trainer.TrainerName == trainer.TrainerName && r.Pipeline.Trainer.HyperParamSet != null && r.Pipeline.Trainer.HyperParamSet.Any()); + + // get new set of hyperparameter values + var proposedParamSet = sweeper.ProposeSweeps(1, historyToUse.Select(h => h.ToRunResult(isMaximizingMetric))).First(); + if(!proposedParamSet.Any()) + { + return false; + } + + // associate proposed param set with trainer, so that smart hyperparam + // sweepers (like KDO) can map them back. + trainer.SetHyperparamValues(proposedParamSet); + + return true; + } + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.AutoML/Sweepers/ISweeper.cs b/src/Microsoft.ML.AutoML/Sweepers/ISweeper.cs new file mode 100644 index 0000000000..004e9a6f08 --- /dev/null +++ b/src/Microsoft.ML.AutoML/Sweepers/ISweeper.cs @@ -0,0 +1,273 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Internal.Utilities; + +namespace Microsoft.ML.AutoML +{ + /// + /// The main interface of the sweeper + /// + internal interface ISweeper + { + /// + /// Returns between 0 and maxSweeps configurations to run. + /// It expects a list of previous runs such that it can generate configurations that were not already tried. + /// The list of runs can be null if there were no previous runs. 
+ /// Some smart sweepers can take advantage of the metric(s) that the caller computes for previous runs. + /// + ParameterSet[] ProposeSweeps(int maxSweeps, IEnumerable previousRuns = null); + } + + /// + /// This is the interface that each type of parameter sweep needs to implement + /// + internal interface IValueGenerator + { + /// + /// Given a value in the [0,1] range, return a value for this parameter. + /// + IParameterValue CreateFromNormalized(Double normalizedValue); + + /// + /// Used mainly in grid sweepers, return the i-th distinct value for this parameter + /// + IParameterValue this[int i] { get; } + + /// + /// Used mainly in grid sweepers, return the count of distinct values for this parameter + /// + int Count { get; } + + /// + /// Returns the name of the generated parameter + /// + string Name { get; } + } + + /// + /// Parameter value generated from the sweeping. + /// The parameter values must be immutable. + /// Value is converted to string because the runner will usually want to construct a command line for TL. + /// Implementations of this interface must also override object.GetHashCode() and object.Equals(object) so they are consistent + /// with IEquatable.Equals(IParameterValue). + /// + internal interface IParameterValue : IEquatable + { + string Name { get; } + string ValueText { get; } + } + + /// + /// Type safe version of the IParameterValue interface. + /// + internal interface IParameterValue : IParameterValue + { + TValue Value { get; } + } + + /// + /// A set of parameter values. + /// The parameter set must be immutable. 
+ /// + internal sealed class ParameterSet : IEquatable, IEnumerable + { + private readonly Dictionary _parameterValues; + private readonly int _hash; + + public ParameterSet(IEnumerable parameters) + { + _parameterValues = new Dictionary(); + foreach (var parameter in parameters) + { + _parameterValues.Add(parameter.Name, parameter); + } + + var parameterNames = _parameterValues.Keys.ToList(); + parameterNames.Sort(); + _hash = 0; + foreach (var parameterName in parameterNames) + { + _hash = Hashing.CombineHash(_hash, _parameterValues[parameterName].GetHashCode()); + } + } + + public ParameterSet(Dictionary paramValues, int hash) + { + _parameterValues = paramValues; + _hash = hash; + } + + public IEnumerator GetEnumerator() + { + return _parameterValues.Values.GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + public int Count + { + get { return _parameterValues.Count; } + } + + public IParameterValue this[string name] + { + get { return _parameterValues[name]; } + } + + private bool ContainsParamValue(IParameterValue parameterValue) + { + IParameterValue value; + return _parameterValues.TryGetValue(parameterValue.Name, out value) && + parameterValue.Equals(value); + } + + public bool Equals(ParameterSet other) + { + if (other == null || other._hash != _hash || other._parameterValues.Count != _parameterValues.Count) + return false; + return other._parameterValues.Values.All(pv => ContainsParamValue(pv)); + } + + public ParameterSet Clone() => + new ParameterSet(new Dictionary(_parameterValues), _hash); + + public override string ToString() + { + return string.Join(" ", _parameterValues.Select(kvp => string.Format("{0}={1}", kvp.Value.Name, kvp.Value.ValueText)).ToArray()); + } + + public override int GetHashCode() + { + return _hash; + } + } + + /// + /// The result of a run. + /// Contains the parameter set used, useful for the sweeper to not generate the same configuration multiple times. 
+ /// Also contains the result of a run and the metric value that is used by smart sweepers to generate new configurations + /// that try to maximize this metric. + /// + internal interface IRunResult : IComparable + { + ParameterSet ParameterSet { get; } + IComparable MetricValue { get; } + bool IsMetricMaximizing { get; } + } + + internal interface IRunResult : IRunResult + where T : IComparable + { + new T MetricValue { get; } + } + + /// + /// Simple implementation of IRunResult + /// + internal sealed class RunResult : IRunResult + { + private readonly ParameterSet _parameterSet; + private readonly Double? _metricValue; + private readonly bool _isMetricMaximizing; + + /// + /// This switch changes the behavior of the CompareTo function, switching the greater than / less than + /// behavior, depending on if it is set to True. + /// + public bool IsMetricMaximizing { get { return _isMetricMaximizing; } } + + public ParameterSet ParameterSet + { + get { return _parameterSet; } + } + + public RunResult(ParameterSet parameterSet, Double metricValue, bool isMetricMaximizing) + { + _parameterSet = parameterSet; + _metricValue = metricValue; + _isMetricMaximizing = isMetricMaximizing; + } + + public Double MetricValue + { + get + { + return _metricValue.Value; + } + } + + public int CompareTo(IRunResult other) + { + var otherTyped = other as RunResult; + //Contracts.Check(otherTyped != null); + if (_metricValue == otherTyped._metricValue) + return 0; + return _isMetricMaximizing ^ (_metricValue < otherTyped._metricValue) ? 1 : -1; + } + + public bool HasMetricValue + { + get + { + return _metricValue != null; + } + } + + IComparable IRunResult.MetricValue + { + get { return MetricValue; } + } + } + + /// + /// The metric class, used by smart sweeping algorithms. + /// Ideally we would like to move towards the new IDataView/ISchematized, this is + /// just a simple view instead, and it is decoupled from RunResult so we can move + /// in that direction in the future. 
+ /// + internal sealed class RunMetric + { + private readonly float _primaryMetric; + private readonly float[] _metricDistribution; + + public RunMetric(float primaryMetric, IEnumerable metricDistribution = null) + { + _primaryMetric = primaryMetric; + if (metricDistribution != null) + _metricDistribution = metricDistribution.ToArray(); + } + + /// + /// The primary metric to optimize. + /// This metric is usually an aggregate value for the run, for example, AUC, accuracy etc. + /// By default, smart sweeping algorithms will maximize this metric. + /// If you want to minimize, either negate this value or change the option in the arguments of the sweeper constructor. + /// + public float PrimaryMetric + { + get { return _primaryMetric; } + } + + /// + /// The (optional) distribution of the metric. + /// This distribution can be a secondary measure of how good a run was, e.g per-fold AUC, per-fold accuracy, (sampled) per-instance log loss etc. + /// + public float[] GetMetricDistribution() + { + if (_metricDistribution == null) + return null; + var result = new float[_metricDistribution.Length]; + Array.Copy(_metricDistribution, result, _metricDistribution.Length); + return result; + } + } +} diff --git a/src/Microsoft.ML.AutoML/Sweepers/Parameters.cs b/src/Microsoft.ML.AutoML/Sweepers/Parameters.cs new file mode 100644 index 0000000000..77916f3926 --- /dev/null +++ b/src/Microsoft.ML.AutoML/Sweepers/Parameters.cs @@ -0,0 +1,481 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using Microsoft.ML.Internal.Utilities; + +namespace Microsoft.ML.AutoML +{ + internal abstract class BaseParamArguments + { + // Parameter name + public string Name; + } + + internal abstract class NumericParamArguments : BaseParamArguments + { + // Number of steps for grid runthrough. + public int NumSteps; + + // Amount of increment between steps (multiplicative if log). + public Double? StepSize; + + // Log scale. + public bool LogBase; + + public NumericParamArguments() + { + NumSteps = 100; + StepSize = null; + LogBase = false; + } + } + + internal class FloatParamArguments : NumericParamArguments + { + // Minimum value + public float Min; + + // Maximum value + public float Max; + } + + internal class LongParamArguments : NumericParamArguments + { + // Minimum value + public long Min; + + // Maximum value + public long Max; + } + + internal class DiscreteParamArguments : BaseParamArguments + { + // Values + public string[] Values; + } + + internal sealed class LongParameterValue : IParameterValue + { + private readonly string _name; + private readonly string _valueText; + private readonly long _value; + + public string Name + { + get { return _name; } + } + + public string ValueText + { + get { return _valueText; } + } + + public long Value + { + get { return _value; } + } + + public LongParameterValue(string name, long value) + { + _name = name; + _value = value; + _valueText = _value.ToString("D"); + } + + public bool Equals(IParameterValue other) + { + return Equals((object)other); + } + + public override bool Equals(object obj) + { + var lpv = obj as LongParameterValue; + return lpv != null && Name == lpv.Name && _value == lpv._value; + } + + public override int GetHashCode() + { + return Hashing.CombinedHash(0, typeof(LongParameterValue), _name, _value); + } + } + + internal sealed class FloatParameterValue : IParameterValue + { + private readonly string _name; + private readonly string _valueText; + 
private readonly float _value; + + public string Name + { + get { return _name; } + } + + public string ValueText + { + get { return _valueText; } + } + + public float Value + { + get { return _value; } + } + + public FloatParameterValue(string name, float value) + { + Runtime.Contracts.Assert(!float.IsNaN(value)); + _name = name; + _value = value; + _valueText = _value.ToString("R"); + } + + public bool Equals(IParameterValue other) + { + return Equals((object)other); + } + + public override bool Equals(object obj) + { + var fpv = obj as FloatParameterValue; + return fpv != null && Name == fpv.Name && _value == fpv._value; + } + + public override int GetHashCode() + { + return Hashing.CombinedHash(0, typeof(FloatParameterValue), _name, _value); + } + } + + internal sealed class StringParameterValue : IParameterValue + { + private readonly string _name; + private readonly string _value; + + public string Name + { + get { return _name; } + } + + public string ValueText + { + get { return _value; } + } + + public string Value + { + get { return _value; } + } + + public StringParameterValue(string name, string value) + { + _name = name; + _value = value; + } + + public bool Equals(IParameterValue other) + { + return Equals((object)other); + } + + public override bool Equals(object obj) + { + var spv = obj as StringParameterValue; + return spv != null && Name == spv.Name && ValueText == spv.ValueText; + } + + public override int GetHashCode() + { + return Hashing.CombinedHash(0, typeof(StringParameterValue), _name, _value); + } + } + + internal interface INumericValueGenerator : IValueGenerator + { + float NormalizeValue(IParameterValue value); + bool InRange(IParameterValue value); + } + + /// + /// The integer type parameter sweep. 
+ /// + internal class LongValueGenerator : INumericValueGenerator + { + private readonly LongParamArguments _args; + private IParameterValue[] _gridValues; + + public string Name { get { return _args.Name; } } + + public LongValueGenerator(LongParamArguments args) + { + Runtime.Contracts.Assert(args.Min < args.Max, "min must be less than max"); + // REVIEW: this condition can be relaxed if we change the math below to deal with it + Runtime.Contracts.Assert(!args.LogBase || args.Min > 0, "min must be positive if log scale is used"); + Runtime.Contracts.Assert(!args.LogBase || args.StepSize == null || args.StepSize > 1, "StepSize must be greater than 1 if log scale is used"); + Runtime.Contracts.Assert(args.LogBase || args.StepSize == null || args.StepSize > 0, "StepSize must be greater than 0 if linear scale is used"); + _args = args; + } + + // REVIEW: Is Float accurate enough? + public IParameterValue CreateFromNormalized(Double normalizedValue) + { + long val; + if (_args.LogBase) + { + // REVIEW: review the math below, it only works for positive Min and Max + var logBase = !_args.StepSize.HasValue + ? 
Math.Pow(1.0 * _args.Max / _args.Min, 1.0 / (_args.NumSteps - 1)) + : _args.StepSize.Value; + var logMax = Math.Log(_args.Max, logBase); + var logMin = Math.Log(_args.Min, logBase); + val = (long)(_args.Min * Math.Pow(logBase, normalizedValue * (logMax - logMin))); + } + else + val = (long)(_args.Min + normalizedValue * (_args.Max - _args.Min)); + + return new LongParameterValue(_args.Name, val); + } + + private void EnsureParameterValues() + { + if (_gridValues != null) + return; + + var result = new List(); + if ((_args.StepSize == null && _args.NumSteps > (_args.Max - _args.Min)) || + (_args.StepSize != null && _args.StepSize <= 1)) + { + for (long i = _args.Min; i <= _args.Max; i++) + result.Add(new LongParameterValue(_args.Name, i)); + } + else + { + if (_args.LogBase) + { + // REVIEW: review the math below, it only works for positive Min and Max + var logBase = _args.StepSize ?? Math.Pow(1.0 * _args.Max / _args.Min, 1.0 / (_args.NumSteps - 1)); + + long prevValue = long.MinValue; + var maxPlusEpsilon = _args.Max * Math.Sqrt(logBase); + for (Double value = _args.Min; value <= maxPlusEpsilon; value *= logBase) + { + var longValue = (long)value; + if (longValue > prevValue) + result.Add(new LongParameterValue(_args.Name, longValue)); + prevValue = longValue; + } + } + else + { + var stepSize = _args.StepSize ?? 
(Double)(_args.Max - _args.Min) / (_args.NumSteps - 1); + long prevValue = long.MinValue; + var maxPlusEpsilon = _args.Max + stepSize / 2; + for (Double value = _args.Min; value <= maxPlusEpsilon; value += stepSize) + { + var longValue = (long)value; + if (longValue > prevValue) + result.Add(new LongParameterValue(_args.Name, longValue)); + prevValue = longValue; + } + } + } + _gridValues = result.ToArray(); + } + + public IParameterValue this[int i] + { + get + { + EnsureParameterValues(); + return _gridValues[i]; + } + } + + public int Count + { + get + { + EnsureParameterValues(); + return _gridValues.Length; + } + } + + public float NormalizeValue(IParameterValue value) + { + var valueTyped = value as LongParameterValue; + Runtime.Contracts.Assert(valueTyped != null, "LongValueGenerator could not normalized parameter because it is not of the correct type"); + Runtime.Contracts.Assert(_args.Min <= valueTyped.Value && valueTyped.Value <= _args.Max, "Value not in correct range"); + + if (_args.LogBase) + { + float logBase = (float)(_args.StepSize ?? Math.Pow(1.0 * _args.Max / _args.Min, 1.0 / (_args.NumSteps - 1))); + return (float)((Math.Log(valueTyped.Value, logBase) - Math.Log(_args.Min, logBase)) / (Math.Log(_args.Max, logBase) - Math.Log(_args.Min, logBase))); + } + else + return (float)(valueTyped.Value - _args.Min) / (_args.Max - _args.Min); + } + + public bool InRange(IParameterValue value) + { + var valueTyped = value as LongParameterValue; + return (_args.Min <= valueTyped.Value && valueTyped.Value <= _args.Max); + } + } + + /// + /// The floating point type parameter sweep. 
+ /// + internal class FloatValueGenerator : INumericValueGenerator + { + private readonly FloatParamArguments _args; + private IParameterValue[] _gridValues; + + public string Name { get { return _args.Name; } } + + public FloatValueGenerator(FloatParamArguments args) + { + Runtime.Contracts.Assert(args.Min < args.Max, "min must be less than max"); + // REVIEW: this condition can be relaxed if we change the math below to deal with it + Runtime.Contracts.Assert(!args.LogBase || args.Min > 0, "min must be positive if log scale is used"); + Runtime.Contracts.Assert(!args.LogBase || args.StepSize == null || args.StepSize > 1, "StepSize must be greater than 1 if log scale is used"); + Runtime.Contracts.Assert(args.LogBase || args.StepSize == null || args.StepSize > 0, "StepSize must be greater than 0 if linear scale is used"); + _args = args; + } + + // REVIEW: Is Float accurate enough? + public IParameterValue CreateFromNormalized(Double normalizedValue) + { + float val; + if (_args.LogBase) + { + // REVIEW: review the math below, it only works for positive Min and Max + var logBase = !_args.StepSize.HasValue + ? Math.Pow(1.0 * _args.Max / _args.Min, 1.0 / (_args.NumSteps - 1)) + : _args.StepSize.Value; + var logMax = Math.Log(_args.Max, logBase); + var logMin = Math.Log(_args.Min, logBase); + val = (float)(_args.Min * Math.Pow(logBase, normalizedValue * (logMax - logMin))); + } + else + val = (float)(_args.Min + normalizedValue * (_args.Max - _args.Min)); + + return new FloatParameterValue(_args.Name, val); + } + + private void EnsureParameterValues() + { + if (_gridValues != null) + return; + + var result = new List(); + if (_args.LogBase) + { + // REVIEW: review the math below, it only works for positive Min and Max + var logBase = _args.StepSize ?? 
Math.Pow(1.0 * _args.Max / _args.Min, 1.0 / (_args.NumSteps - 1)); + + float prevValue = float.NegativeInfinity; + var maxPlusEpsilon = _args.Max * Math.Sqrt(logBase); + for (Double value = _args.Min; value <= maxPlusEpsilon; value *= logBase) + { + var floatValue = (float)value; + if (floatValue > prevValue) + result.Add(new FloatParameterValue(_args.Name, floatValue)); + prevValue = floatValue; + } + } + else + { + var stepSize = _args.StepSize ?? (Double)(_args.Max - _args.Min) / (_args.NumSteps - 1); + float prevValue = float.NegativeInfinity; + var maxPlusEpsilon = _args.Max + stepSize / 2; + for (Double value = _args.Min; value <= maxPlusEpsilon; value += stepSize) + { + var floatValue = (float)value; + if (floatValue > prevValue) + result.Add(new FloatParameterValue(_args.Name, floatValue)); + prevValue = floatValue; + } + } + + _gridValues = result.ToArray(); + } + + public IParameterValue this[int i] + { + get + { + EnsureParameterValues(); + return _gridValues[i]; + } + } + + public int Count + { + get + { + EnsureParameterValues(); + return _gridValues.Length; + } + } + + public float NormalizeValue(IParameterValue value) + { + var valueTyped = value as FloatParameterValue; + Runtime.Contracts.Assert(valueTyped != null, "FloatValueGenerator could not normalized parameter because it is not of the correct type"); + Runtime.Contracts.Assert(_args.Min <= valueTyped.Value && valueTyped.Value <= _args.Max, "Value not in correct range"); + + if (_args.LogBase) + { + float logBase = (float)(_args.StepSize ?? 
Math.Pow(1.0 * _args.Max / _args.Min, 1.0 / (_args.NumSteps - 1))); + return (float)((Math.Log(valueTyped.Value, logBase) - Math.Log(_args.Min, logBase)) / (Math.Log(_args.Max, logBase) - Math.Log(_args.Min, logBase))); + } + else + return (valueTyped.Value - _args.Min) / (_args.Max - _args.Min); + } + + public bool InRange(IParameterValue value) + { + var valueTyped = value as FloatParameterValue; + Runtime.Contracts.Assert(valueTyped != null, "Parameter should be of type FloatParameterValue"); + return (_args.Min <= valueTyped.Value && valueTyped.Value <= _args.Max); + } + } + + /// + /// The discrete parameter sweep. + /// + internal class DiscreteValueGenerator : IValueGenerator + { + private readonly DiscreteParamArguments _args; + + public string Name { get { return _args.Name; } } + + public DiscreteValueGenerator(DiscreteParamArguments args) + { + _args = args; + } + + // REVIEW: Is Float accurate enough? + public IParameterValue CreateFromNormalized(Double normalizedValue) + { + return new StringParameterValue(_args.Name, _args.Values[(int)(_args.Values.Length * normalizedValue)]); + } + + public IParameterValue this[int i] + { + get + { + return new StringParameterValue(_args.Name, _args.Values[i]); + } + } + + public int Count + { + get + { + return _args.Values.Length; + } + } + } +} diff --git a/src/Microsoft.ML.AutoML/Sweepers/Random.cs b/src/Microsoft.ML.AutoML/Sweepers/Random.cs new file mode 100644 index 0000000000..c045436b23 --- /dev/null +++ b/src/Microsoft.ML.AutoML/Sweepers/Random.cs @@ -0,0 +1,29 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Linq; + +namespace Microsoft.ML.AutoML +{ + /// + /// Random sweeper, it generates random values for each of the parameters. 
+ /// + internal sealed class UniformRandomSweeper : SweeperBase + { + public UniformRandomSweeper(ArgumentsBase args) + : base(args, "UniformRandom") + { + } + + public UniformRandomSweeper(ArgumentsBase args, IValueGenerator[] sweepParameters) + : base(args, sweepParameters, "UniformRandom") + { + } + + protected override ParameterSet CreateParamSet() + { + return new ParameterSet(SweepParameters.Select(sweepParameter => sweepParameter.CreateFromNormalized(AutoMlUtils.Random.Value.NextDouble()))); + } + } +} diff --git a/src/Microsoft.ML.AutoML/Sweepers/SmacSweeper.cs b/src/Microsoft.ML.AutoML/Sweepers/SmacSweeper.cs new file mode 100644 index 0000000000..02e82e1cee --- /dev/null +++ b/src/Microsoft.ML.AutoML/Sweepers/SmacSweeper.cs @@ -0,0 +1,436 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Reflection; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; +using Float = System.Single; + +namespace Microsoft.ML.AutoML +{ + //REVIEW: Figure out better way to do this. could introduce a base class for all smart sweepers, + //encapsulating common functionality. This seems like a good plan to persue. + internal sealed class SmacSweeper : ISweeper + { + public sealed class Arguments + { + // Swept parameters + public IValueGenerator[] SweptParameters; + + // Seed for the random number generator for the first batch sweeper + public int RandomSeed; + + // If iteration point is outside parameter definitions, should it be projected? 
+ public bool ProjectInBounds; + + // Number of regression trees in forest + public int NumOfTrees; + + // Minimum number of data points required to be in a node if it is to be split further + public int NMinForSplit; + + // Number of points to use for random initialization + public int NumberInitialPopulation; + + // Number of search parents to use for local search in maximizing EI acquisition function + public int LocalSearchParentCount; + + // Number of random configurations when maximizing EI acquisition function + public int NumRandomEISearchConfigurations; + + // Fraction of eligible dimensions to split on (i.e., split ratio) + public Float SplitRatio; + + // Epsilon threshold for ending local searches + public Float Epsilon; + + // Number of neighbors to sample for locally searching each numerical parameter + public int NumNeighborsForNumericalParams; + + public Arguments() + { + ProjectInBounds = true; + NumOfTrees = 10; + NMinForSplit = 2; + NumberInitialPopulation = 20; + LocalSearchParentCount = 10; + NumRandomEISearchConfigurations = 10000; + SplitRatio = 0.8f; + Epsilon = 0.00001f; + NumNeighborsForNumericalParams = 4; + } + } + + private readonly ISweeper _randomSweeper; + private readonly Arguments _args; + private readonly MLContext _context; + + private readonly IValueGenerator[] _sweepParameters; + + public SmacSweeper(MLContext context, Arguments args) + { + _context = context; + _args = args; + _sweepParameters = args.SweptParameters; + _randomSweeper = new UniformRandomSweeper(new SweeperBase.ArgumentsBase(), _sweepParameters); + } + + public ParameterSet[] ProposeSweeps(int maxSweeps, IEnumerable previousRuns = null) + { + int numOfCandidates = maxSweeps; + + // Initialization: Will enter here on first iteration and use the default (random) + // sweeper to generate initial candidates. + int numRuns = previousRuns == null ? 
0 : previousRuns.Count(); + if (numRuns < _args.NumberInitialPopulation) + return _randomSweeper.ProposeSweeps(Math.Min(numOfCandidates, _args.NumberInitialPopulation - numRuns), previousRuns); + + // Only retain viable runs + List viableRuns = new List(); + foreach (RunResult run in previousRuns) + { + if (run != null && run.HasMetricValue) + viableRuns.Add(run); + } + + // Fit Random Forest Model on previous run data. + var forestPredictor = FitModel(viableRuns); + + // Using acquisition function and current best, get candidate configuration(s). + return GenerateCandidateConfigurations(numOfCandidates, viableRuns, forestPredictor); + } + + private FastForestRegressionModelParameters FitModel(IEnumerable previousRuns) + { + Single[] targets = new Single[previousRuns.Count()]; + Single[][] features = new Single[previousRuns.Count()][]; + + int i = 0; + foreach (RunResult r in previousRuns) + { + features[i] = SweeperProbabilityUtils.ParameterSetAsFloatArray(_sweepParameters, r.ParameterSet, true); + targets[i] = (Float)r.MetricValue; + i++; + } + + ArrayDataViewBuilder dvBuilder = new ArrayDataViewBuilder(_context); + dvBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, targets); + dvBuilder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Single, features); + + IDataView data = dvBuilder.GetDataView(); + Runtime.Contracts.Assert(data.GetRowCount() == targets.Length, "This data view will have as many rows as there have been evaluations"); + + // Set relevant random forest arguments. + // Train random forest. + var trainer = _context.Regression.Trainers.FastForest(new FastForestRegressionTrainer.Options() + { + FeatureFraction = _args.SplitRatio, + NumberOfTrees = _args.NumOfTrees, + MinimumExampleCountPerLeaf = _args.NMinForSplit + }); + var predictor = trainer.Fit(data).Model; + + // Return random forest predictor. 
+ return predictor; + } + + /// + /// Generates a set of candidate configurations to sweep through, based on a combination of random and local + /// search, as outlined in Hutter et al - Sequential Model-Based Optimization for General Algorithm Configuration. + /// Makes use of class private members which determine how many candidates are returned. This number will include + /// random configurations interleaved (per the paper), and thus will be double the specified value. + /// + /// Number of candidate solutions to return. + /// History of previously evaluated points, with their emprical performance values. + /// Trained random forest ensemble. Used in evaluating the candidates. + /// An array of ParamaterSets which are the candidate configurations to sweep. + private ParameterSet[] GenerateCandidateConfigurations(int numOfCandidates, IEnumerable previousRuns, FastForestRegressionModelParameters forest) + { + // Get k best previous runs ParameterSets. + ParameterSet[] bestKParamSets = GetKBestConfigurations(previousRuns, _args.LocalSearchParentCount); + + // Perform local searches using the k best previous run configurations. + ParameterSet[] eiChallengers = GreedyPlusRandomSearch(bestKParamSets, forest, (int)Math.Ceiling(numOfCandidates / 2.0F), previousRuns); + + // Generate another set of random configurations to interleave. + ParameterSet[] randomChallengers = _randomSweeper.ProposeSweeps(numOfCandidates - eiChallengers.Length, previousRuns); + + // Return interleaved challenger candidates with random candidates. Since the number of candidates from either can be less than + // the number asked for, since we only generate unique candidates, and the number from either method may vary considerably. 
+ ParameterSet[] configs = new ParameterSet[eiChallengers.Length + randomChallengers.Length]; + Array.Copy(eiChallengers, 0, configs, 0, eiChallengers.Length); + Array.Copy(randomChallengers, 0, configs, eiChallengers.Length, randomChallengers.Length); + + return configs; + } + + /// + /// Does a mix of greedy local search around best performing parameter sets, while throwing random parameter sets into the mix. + /// + /// Beginning locations for local greedy search. + /// Trained random forest, used later for evaluating parameters. + /// Number of candidate configurations returned by the method (top K). + /// Historical run results. + /// Array of parameter sets, which will then be evaluated. + private ParameterSet[] GreedyPlusRandomSearch(ParameterSet[] parents, FastForestRegressionModelParameters forest, int numOfCandidates, IEnumerable previousRuns) + { + RunResult bestRun = (RunResult)previousRuns.Max(); + RunResult worstRun = (RunResult)previousRuns.Min(); + double bestVal = bestRun.MetricValue; + + HashSet> configurations = new HashSet>(); + + // Perform local search. + foreach (ParameterSet c in parents) + { + Tuple bestChildKvp = LocalSearch(c, forest, bestVal, _args.Epsilon, bestRun.IsMetricMaximizing); + configurations.Add(bestChildKvp); + } + + // Additional set of random configurations to choose from during local search. 
+ ParameterSet[] randomConfigs = _randomSweeper.ProposeSweeps(_args.NumRandomEISearchConfigurations, previousRuns); + double[] randomEIs = EvaluateConfigurationsByEI(forest, bestVal, randomConfigs, bestRun.IsMetricMaximizing); + Runtime.Contracts.Assert(randomConfigs.Length == randomEIs.Length); + + for (int i = 0; i < randomConfigs.Length; i++) + configurations.Add(new Tuple(randomEIs[i], randomConfigs[i])); + + IOrderedEnumerable> bestConfigurations = configurations.OrderByDescending(x => x.Item1); + + var retainedConfigs = new HashSet(bestConfigurations.Select(x => x.Item2)); + + // remove configurations matching previous run + foreach (var previousRun in previousRuns) + { + retainedConfigs.Remove(previousRun.ParameterSet); + } + + return retainedConfigs.Take(numOfCandidates).ToArray(); + } + + /// + /// Performs a local one-mutation neighborhood greedy search. + /// + /// Starting parameter set configuration. + /// Trained forest, for evaluation of points. + /// Best performance seen thus far. + /// Threshold for when to stop the local search. + /// Whether SMAC should aim to maximize (vs minimize) metric. 
+        /// <returns>The best expected-improvement (EI) score reached and the configuration that produced it.</returns>
+        private Tuple<double, ParameterSet> LocalSearch(ParameterSet parent, FastForestRegressionModelParameters forest, double bestVal, double epsilon, bool isMetricMaximizing)
+        {
+            try
+            {
+                double currentBestEI = EvaluateConfigurationsByEI(forest, bestVal, new ParameterSet[] { parent }, isMetricMaximizing)[0];
+                ParameterSet currentBestConfig = parent;
+
+                for (; ; )
+                {
+                    ParameterSet[] neighborhood = GetOneMutationNeighborhood(currentBestConfig);
+
+                    // No parameter can be mutated, so there is nothing left to climb towards.
+                    if (neighborhood.Length == 0)
+                        break;
+
+                    double[] eis = EvaluateConfigurationsByEI(forest, bestVal, neighborhood, isMetricMaximizing);
+                    int bestIndex = eis.ArgMax();
+
+                    // Stop once the best neighbor no longer improves EI by at least the caller's threshold.
+                    // (The threshold parameter used to be ignored in favour of _args.Epsilon.)
+                    if (eis[bestIndex] - currentBestEI < epsilon)
+                        break;
+
+                    currentBestConfig = neighborhood[bestIndex];
+                    currentBestEI = eis[bestIndex];
+                }
+
+                return new Tuple<double, ParameterSet>(currentBestEI, currentBestConfig);
+            }
+            catch (Exception e)
+            {
+                // Re-throw with context; the original failure is kept as the inner exception.
+                throw new InvalidOperationException("SMAC sweeper localSearch threw exception", e);
+            }
+        }
+
+        /// <summary>
+        /// Computes a single-mutation neighborhood (one param at a time) for a given configuration. For
+        /// numeric parameters, samples K mutations (i.e., creates K neighbors based on that parameter),
+        /// where K is <c>NumNeighborsForNumericalParams</c>. For discrete parameters, creates one neighbor
+        /// that uses a randomly chosen value different from the parent's.
+        /// </summary>
+        /// <param name="parent">Starting configuration.</param>
+        /// <returns>A set of configurations that each differ from parent in exactly one parameter.</returns>
+        private ParameterSet[] GetOneMutationNeighborhood(ParameterSet parent)
+        {
+            List<ParameterSet> neighbors = new List<ParameterSet>();
+            SweeperProbabilityUtils spu = new SweeperProbabilityUtils();
+
+            // Compact (non one-hot) encoding of the parent: one slot per sweep parameter.
+            // Every neighbor mutates its own copy of this array.
+            Float[] parentValues = SweeperProbabilityUtils.ParameterSetAsFloatArray(_sweepParameters, parent, false);
+
+            for (int i = 0; i < _sweepParameters.Length; i++)
+            {
+                // This allows us to query possible values of this parameter.
+                IValueGenerator sweepParam = _sweepParameters[i];
+
+                // This holds the actual value for this parameter, chosen in this parameter set.
+                IParameterValue pset = parent[sweepParam.Name];
+
+                Runtime.Contracts.Assert(pset != null);
+
+                DiscreteValueGenerator parameterDiscrete = sweepParam as DiscreteValueGenerator;
+                if (parameterDiscrete != null)
+                {
+                    // A parameter with a single possible value has no "other" value to mutate to.
+                    // Without this guard the index computed below would be Count, i.e. out of range.
+                    if (parameterDiscrete.Count < 2)
+                        continue;
+
+                    // Create one neighbor for every discrete parameter.
+                    Float[] neighbor = (Float[])parentValues.Clone();
+
+                    int hotIndex = -1;
+                    for (int j = 0; j < parameterDiscrete.Count; j++)
+                    {
+                        if (parameterDiscrete[j].Equals(pset))
+                        {
+                            hotIndex = j;
+                            break;
+                        }
+                    }
+
+                    Runtime.Contracts.Assert(hotIndex >= 0);
+
+                    // Pick uniformly among the Count - 1 values other than the current one:
+                    // draw from [0, Count - 1) and step over the current index.
+                    // Uses the shared generator rather than constructing a new Random per parameter.
+                    int randomIndex = AutoMlUtils.Random.Value.Next(0, parameterDiscrete.Count - 1);
+                    if (randomIndex >= hotIndex)
+                        randomIndex++;
+                    neighbor[i] = randomIndex;
+                    neighbors.Add(SweeperProbabilityUtils.FloatArrayAsParameterSet(_sweepParameters, neighbor, false));
+                }
+                else
+                {
+                    INumericValueGenerator parameterNumeric = sweepParam as INumericValueGenerator;
+                    Runtime.Contracts.Assert(parameterNumeric != null, "SMAC sweeper can only sweep over discrete and numeric parameters");
+
+                    // Create k neighbors (typically 4) for every numerical parameter.
+                    for (int j = 0; j < _args.NumNeighborsForNumericalParams; j++)
+                    {
+                        Float[] neigh = (Float[])parentValues.Clone();
+
+                        // Perturb the normalized value with Gaussian noise (std dev 0.2), resampling
+                        // until the result falls strictly inside (0, 1).
+                        double newVal = spu.NormalRVs(1, neigh[i], 0.2)[0];
+                        while (newVal <= 0.0 || newVal >= 1.0)
+                            newVal = spu.NormalRVs(1, neigh[i], 0.2)[0];
+                        neigh[i] = (Float)newVal;
+                        ParameterSet neighbor = SweeperProbabilityUtils.FloatArrayAsParameterSet(_sweepParameters, neigh, false);
+                        neighbors.Add(neighbor);
+                    }
+                }
+            }
+            return neighbors.ToArray();
+        }
+
+        /// <summary>
+        /// Goes through forest to extract the set of leaf values associated with filtering each configuration.
+        /// </summary>
+        /// <param name="forest">Trained forest predictor, used for filtering configs.</param>
+        /// <param name="configs">Parameter configurations.</param>
+        /// <returns>2D array where rows correspond to configurations, and columns to the predicted leaf values.</returns>
+        private double[][] GetForestRegressionLeafValues(FastForestRegressionModelParameters forest, ParameterSet[] configs)
+        {
+            // The tree count does not change while we iterate, so read it once.
+            int treeCount = forest.TrainedTreeEnsemble.Trees.Count;
+            double[][] datasetLeafValues = new double[configs.Length][];
+
+            for (int configIndex = 0; configIndex < configs.Length; configIndex++)
+            {
+                // The encoded feature vector depends only on the configuration, not on the tree,
+                // so build it once per configuration instead of once per (configuration, tree) pair.
+                Float[] transformedParams = SweeperProbabilityUtils.ParameterSetAsFloatArray(_sweepParameters, configs[configIndex], true);
+                VBuffer<Float> features = new VBuffer<Float>(transformedParams.Length, transformedParams);
+
+                // One predicted leaf value per tree for this configuration.
+                double[] leafValues = new double[treeCount];
+                for (int treeId = 0; treeId < treeCount; treeId++)
+                {
+                    int leafId = GetLeaf(forest, treeId, features);
+                    leafValues[treeId] = GetLeafValue(forest, treeId, leafId);
+                }
+                datasetLeafValues[configIndex] = leafValues;
+            }
+            return datasetLeafValues;
+        }
+
+        // Todo: Remove the reflection below for TreeEnsembleModelParameters methods GetLeaf and GetLeafValue.
+        // Long-term, replace with tree featurizer once it becomes available
+        // Tracking issue -- https://github.com/dotnet/machinelearning-automl/issues/342
+        // The handles are resolved once and never reassigned, hence readonly.
+        private static readonly MethodInfo _getLeafMethod = typeof(TreeEnsembleModelParameters).GetMethod("GetLeaf", BindingFlags.NonPublic | BindingFlags.Instance);
+        private static readonly MethodInfo _getLeafValueMethod = typeof(TreeEnsembleModelParameters).GetMethod("GetLeafValue", BindingFlags.NonPublic | BindingFlags.Instance);
+
+        /// <summary>
+        /// Calls the non-public <c>GetLeaf</c> method of <paramref name="model"/> through reflection.
+        /// </summary>
+        /// <param name="model">Tree ensemble to query.</param>
+        /// <param name="treeId">Index of the tree inside the ensemble.</param>
+        /// <param name="features">Encoded configuration to route through the tree.</param>
+        /// <returns>The integer returned by the underlying <c>GetLeaf</c> call.</returns>
+        private static int GetLeaf(TreeEnsembleModelParameters model, int treeId, VBuffer<Float> features)
+        {
+            // The third argument ("path" in the original code) is not needed here, so null is passed, as before.
+            return (int)_getLeafMethod.Invoke(model, new object[] { treeId, features, null });
+        }
+
+        /// <summary>
+        /// Calls the non-public <c>GetLeafValue</c> method of <paramref name="model"/> through reflection.
+        /// </summary>
+        /// <param name="model">Tree ensemble to query.</param>
+        /// <param name="treeId">Index of the tree inside the ensemble.</param>
+        /// <param name="leafId">Leaf identifier, as returned by <see cref="GetLeaf"/>.</param>
+        /// <returns>The float returned by the underlying <c>GetLeafValue</c> call.</returns>
+        private static float GetLeafValue(TreeEnsembleModelParameters model, int treeId, int leafId)
+        {
+            return (float)_getLeafValueMethod.Invoke(model, new object[] { treeId, leafId });
+        }
+
+        /// <summary>
+        /// Computes the empirical means and standard deviations for trees in the forest for each configuration.
+        /// </summary>
+        /// <param name="leafValues">The sets of leaf values from which the means and standard deviations are computed.</param>
+ /// A 2D array with one row per set of tree values, and the columns being mean and stddev, respectively. + private double[][] ComputeForestStats(double[][] leafValues) + { + // Computes the empirical mean and empirical std dev from the leaf prediction values. + double[][] meansAndStdDevs = new double[leafValues.Length][]; + for (int i = 0; i < leafValues.Length; i++) + { + double[] row = new double[2]; + row[0] = VectorUtils.GetMean(leafValues[i]); + row[1] = VectorUtils.GetStandardDeviation(leafValues[i]); + meansAndStdDevs[i] = row; + } + return meansAndStdDevs; + } + + private double[] EvaluateConfigurationsByEI(FastForestRegressionModelParameters forest, double bestVal, ParameterSet[] configs, bool isMetricMaximizing) + { + double[][] leafPredictions = GetForestRegressionLeafValues(forest, configs); + double[][] forestStatistics = ComputeForestStats(leafPredictions); + return ComputeEIs(bestVal, forestStatistics, isMetricMaximizing); + } + + private ParameterSet[] GetKBestConfigurations(IEnumerable previousRuns, int k = 10) + { + // NOTE: Should we change this to rank according to EI (using forest), instead of observed performance? + + SortedSet bestK = new SortedSet(); + + foreach (RunResult r in previousRuns) + { + RunResult worst = bestK.Min(); + + if (bestK.Count < k || r.CompareTo(worst) > 0) + bestK.Add(r); + + if (bestK.Count > k) + bestK.Remove(worst); + } + + // Extract the ParamaterSets and return. 
+ List outSet = new List(); + foreach (RunResult r in bestK) + outSet.Add(r.ParameterSet); + return outSet.ToArray(); + } + + private double ComputeEI(double bestVal, double[] forestStatistics, bool isMetricMaximizing) + { + double empMean = forestStatistics[0]; + double empStdDev = forestStatistics[1]; + double centered = empMean - bestVal; + if (!isMetricMaximizing) + { + centered *= -1; + } + if (empStdDev == 0) + { + return centered; + } + double ztrans = centered / empStdDev; + return centered * SweeperProbabilityUtils.StdNormalCdf(ztrans) + empStdDev * SweeperProbabilityUtils.StdNormalPdf(ztrans); + } + + private double[] ComputeEIs(double bestVal, double[][] forestStatistics, bool isMetricMaximizing) + { + double[] eis = new double[forestStatistics.Length]; + for (int i = 0; i < forestStatistics.Length; i++) + eis[i] = ComputeEI(bestVal, forestStatistics[i], isMetricMaximizing); + return eis; + } + } +} \ No newline at end of file diff --git a/src/Microsoft.ML.AutoML/Sweepers/SweeperBase.cs b/src/Microsoft.ML.AutoML/Sweepers/SweeperBase.cs new file mode 100644 index 0000000000..0d7b9aa5c9 --- /dev/null +++ b/src/Microsoft.ML.AutoML/Sweepers/SweeperBase.cs @@ -0,0 +1,78 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using System.Linq; + +namespace Microsoft.ML.AutoML +{ + /// + /// Signature for the GUI loaders of sweepers. + /// + internal delegate void SignatureSweeperFromParameterList(IValueGenerator[] sweepParameters); + + /// + /// Base sweeper that ensures the suggestions are different from each other and from the previous runs. + /// + internal abstract class SweeperBase : ISweeper + { + internal class ArgumentsBase + { + public IValueGenerator[] SweptParameters; + + // Number of tries to generate distinct parameter sets. 
+ public int Retries; + + public ArgumentsBase() + { + Retries = 10; + } + } + + private readonly ArgumentsBase _args; + protected readonly IValueGenerator[] SweepParameters; + + protected SweeperBase(ArgumentsBase args, string name) + { + _args = args; + + SweepParameters = args.SweptParameters.ToArray(); + } + + protected SweeperBase(ArgumentsBase args, IValueGenerator[] sweepParameters, string name) + { + _args = args; + SweepParameters = sweepParameters; + } + + public virtual ParameterSet[] ProposeSweeps(int maxSweeps, IEnumerable previousRuns = null) + { + var prevParamSets = new HashSet(previousRuns?.Select(r => r.ParameterSet).ToList() ?? new List()); + var result = new HashSet(); + for (int i = 0; i < maxSweeps; i++) + { + ParameterSet paramSet; + int retries = 0; + do + { + paramSet = CreateParamSet(); + ++retries; + } while (paramSet != null && retries < _args.Retries && + (AlreadyGenerated(paramSet, prevParamSets) || AlreadyGenerated(paramSet, result))); + + Runtime.Contracts.Assert(paramSet != null); + result.Add(paramSet); + } + + return result.ToArray(); + } + + protected abstract ParameterSet CreateParamSet(); + + protected static bool AlreadyGenerated(ParameterSet paramSet, ISet previousRuns) + { + return previousRuns.Contains(paramSet); + } + } +} diff --git a/src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs b/src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs new file mode 100644 index 0000000000..f848a4e27b --- /dev/null +++ b/src/Microsoft.ML.AutoML/Sweepers/SweeperProbabilityUtils.cs @@ -0,0 +1,160 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+
+using System;
+using System.Collections.Generic;
+using Microsoft.ML.Internal.CpuMath;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Probability helpers and parameter-set/float-array conversions shared by the sweepers.
+    /// </summary>
+    internal sealed class SweeperProbabilityUtils
+    {
+        /// <summary>
+        /// Density of the standard normal distribution at <paramref name="x"/>.
+        /// </summary>
+        public static double StdNormalPdf(double x)
+        {
+            return 1 / Math.Sqrt(2 * Math.PI) * Math.Exp(-(x * x) / 2);
+        }
+
+        /// <summary>
+        /// Cumulative distribution function of the standard normal distribution at <paramref name="x"/>.
+        /// </summary>
+        public static double StdNormalCdf(double x)
+        {
+            return 0.5 * (1 + ProbabilityFunctions.Erf(x * 1 / Math.Sqrt(2)));
+        }
+
+        /// <summary>
+        /// Samples from a Gaussian Normal with mean mu and std dev sigma.
+        /// </summary>
+        /// <param name="numRVs">Number of samples</param>
+        /// <param name="mu">mean</param>
+        /// <param name="sigma">standard deviation</param>
+        /// <returns>Array of <paramref name="numRVs"/> samples (empty when the count is not positive).</returns>
+        public double[] NormalRVs(int numRVs, double mu, double sigma)
+        {
+            double[] rvs = new double[Math.Max(numRVs, 0)];
+
+            for (int i = 0; i < rvs.Length; i++)
+            {
+                // NextDouble() may return exactly 0, and Math.Log(0) is -Infinity, which would turn the
+                // sample into Infinity or NaN. Using 1 - u moves the draw to (0, 1] so the log is finite.
+                double u1 = 1.0 - AutoMlUtils.Random.Value.NextDouble();
+                double u2 = AutoMlUtils.Random.Value.NextDouble();
+                rvs[i] = mu + sigma * Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Sin(2.0 * Math.PI * u2);
+            }
+
+            return rvs;
+        }
+
+        /// <summary>
+        /// Simple binary search method for finding smallest index in array where value
+        /// meets or exceeds what you're looking for.
+        /// </summary>
+        /// <param name="a">Array to search</param>
+        /// <param name="u">Value to search for</param>
+        /// <param name="low">Left boundary of search</param>
+        /// <param name="high">Right boundary of search</param>
+        /// <returns><paramref name="low"/> or <paramref name="high"/> of the final, narrowed interval.</returns>
+        private int BinarySearch(double[] a, double u, int low, int high)
+        {
+            int diff = high - low;
+            if (diff < 2)
+                return a[low] >= u ? low : high;
+            int mid = low + (diff / 2);
+            return a[mid] >= u ? BinarySearch(a, u, low, mid) : BinarySearch(a, u, mid, high);
+        }
+
+        /// <summary>
+        /// Encodes a parameter set as a float array, visiting <paramref name="sweepParams"/> in order.
+        /// Discrete parameters become either a one-hot group (one slot per possible value) or a single
+        /// slot holding the index of the chosen value; long and float parameters become one slot holding
+        /// the generator's normalized value.
+        /// </summary>
+        /// <param name="sweepParams">Generators describing every swept parameter.</param>
+        /// <param name="ps">Parameter set to encode; must contain a value for every generator.</param>
+        /// <param name="expandCategoricals">True for one-hot encoding of discrete parameters, false for a single index slot.</param>
+        /// <returns>The encoded values.</returns>
+        /// <exception cref="InvalidOperationException">A generator is neither discrete, long nor float.</exception>
+        public static float[] ParameterSetAsFloatArray(IValueGenerator[] sweepParams, ParameterSet ps, bool expandCategoricals = true)
+        {
+            Runtime.Contracts.Assert(ps.Count == sweepParams.Length);
+
+            var result = new List<float>();
+
+            for (int i = 0; i < sweepParams.Length; i++)
+            {
+                // This allows us to query possible values of this parameter.
+                var sweepParam = sweepParams[i];
+
+                // This holds the actual value for this parameter, chosen in this parameter set.
+                var pset = ps[sweepParam.Name];
+                Runtime.Contracts.Assert(pset != null);
+
+                var parameterDiscrete = sweepParam as DiscreteValueGenerator;
+                if (parameterDiscrete != null)
+                {
+                    // Locate which of the generator's values the parameter set currently holds.
+                    int hotIndex = -1;
+                    for (int j = 0; j < parameterDiscrete.Count; j++)
+                    {
+                        if (parameterDiscrete[j].Equals(pset))
+                        {
+                            hotIndex = j;
+                            break;
+                        }
+                    }
+                    Runtime.Contracts.Assert(hotIndex >= 0);
+
+                    if (expandCategoricals)
+                        for (int j = 0; j < parameterDiscrete.Count; j++)
+                            result.Add(j == hotIndex ? 1 : 0);
+                    else
+                        result.Add(hotIndex);
+                }
+                else if (sweepParam is LongValueGenerator lvg)
+                {
+                    // Normalizing all numeric parameters to [0,1] range.
+                    result.Add(lvg.NormalizeValue(new LongParameterValue(pset.Name, long.Parse(pset.ValueText))));
+                }
+                else if (sweepParam is FloatValueGenerator fvg)
+                {
+                    // Normalizing all numeric parameters to [0,1] range.
+                    result.Add(fvg.NormalizeValue(new FloatParameterValue(pset.Name, float.Parse(pset.ValueText))));
+                }
+                else
+                {
+                    throw new InvalidOperationException("Smart sweeper can only sweep over discrete and numeric parameters");
+                }
+            }
+
+            return result.ToArray();
+        }
+
+        /// <summary>
+        /// Decodes a float array produced by <see cref="ParameterSetAsFloatArray"/> back into a parameter set.
+        /// </summary>
+        /// <param name="sweepParams">Generators describing every swept parameter, in encoding order.</param>
+        /// <param name="array">Encoded values.</param>
+        /// <param name="expandedCategoricals">Must match the flag that was used when encoding.</param>
+        /// <returns>The decoded parameter set.</returns>
+        public static ParameterSet FloatArrayAsParameterSet(IValueGenerator[] sweepParams, float[] array, bool expandedCategoricals = true)
+        {
+            // Only the compact encoding has exactly one slot per parameter; a one-hot encoded array
+            // has one slot per possible value of every discrete parameter.
+            Runtime.Contracts.Assert(expandedCategoricals || array.Length == sweepParams.Length);
+
+            List<IParameterValue> parameters = new List<IParameterValue>();
+
+            // Read position in the array; it differs from i as soon as a one-hot group has been consumed.
+            int currentArrayIndex = 0;
+            for (int i = 0; i < sweepParams.Length; i++)
+            {
+                var parameterDiscrete = sweepParams[i] as DiscreteValueGenerator;
+                if (parameterDiscrete != null)
+                {
+                    if (expandedCategoricals)
+                    {
+                        // The group for this parameter starts at currentArrayIndex (not at i).
+                        int hotIndex = -1;
+                        for (int j = 0; j < parameterDiscrete.Count; j++)
+                        {
+                            if (array[currentArrayIndex + j] > 0)
+                            {
+                                hotIndex = j;
+                                break;
+                            }
+                        }
+                        // hotIndex is relative to the group, so the valid range starts at 0.
+                        Runtime.Contracts.Assert(hotIndex >= 0);
+                        parameters.Add(new StringParameterValue(sweepParams[i].Name, parameterDiscrete[hotIndex].ValueText));
+                        currentArrayIndex += parameterDiscrete.Count;
+                    }
+                    else
+                    {
+                        parameters.Add(new StringParameterValue(sweepParams[i].Name, parameterDiscrete[(int)array[currentArrayIndex]].ValueText));
+                        currentArrayIndex++;
+                    }
+                }
+                else
+                {
+                    parameters.Add(sweepParams[i].CreateFromNormalized(array[currentArrayIndex]));
+                    currentArrayIndex++;
+                }
+            }
+
+            return new ParameterSet(parameters);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/TaskKind.cs b/src/Microsoft.ML.AutoML/TaskKind.cs
new file mode 100644
index 0000000000..3370ba624f
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TaskKind.cs
@@ -0,0 +1,13 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.AutoML
+{
+    internal enum TaskKind
+    {
+        BinaryClassification,
+        MulticlassClassification,
+        Regression,
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Terminators/IterationBasedTerminator.cs b/src/Microsoft.ML.AutoML/Terminators/IterationBasedTerminator.cs
new file mode 100644
index 0000000000..16741a65d2
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Terminators/IterationBasedTerminator.cs
@@ -0,0 +1,26 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Termination rule based on a fixed budget of iterations.
+    /// </summary>
+    internal sealed class IterationBasedTerminator
+    {
+        // Total number of iterations allowed, fixed at construction.
+        private readonly int _numTotalIterations;
+
+        /// <summary>Creates a terminator with a budget of <paramref name="numTotalIterations"/> iterations.</summary>
+        public IterationBasedTerminator(int numTotalIterations) => _numTotalIterations = numTotalIterations;
+
+        /// <summary>
+        /// Returns true once <paramref name="numPreviousIterations"/> has reached or passed the budget.
+        /// </summary>
+        public bool ShouldTerminate(int numPreviousIterations) => !(numPreviousIterations < _numTotalIterations);
+
+        /// <summary>
+        /// Returns the budget minus <paramref name="numPreviousIterations"/>. The result is not clamped,
+        /// so it is negative when the budget has been exceeded.
+        /// </summary>
+        public int RemainingIterations(int numPreviousIterations) => _numTotalIterations - numPreviousIterations;
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/TrainerExtensions/BinaryTrainerExtensions.cs b/src/Microsoft.ML.AutoML/TrainerExtensions/BinaryTrainerExtensions.cs
new file mode 100644
index 0000000000..b68a436218
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TrainerExtensions/BinaryTrainerExtensions.cs
@@ -0,0 +1,235 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Calibrators;
+using Microsoft.ML.Data;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Trainers.FastTree;
+using Microsoft.ML.Trainers.LightGbm;
+
+namespace Microsoft.ML.AutoML
+{
+    // NOTE(review): this alias (and the other generic uses in this file) appears to have lost
+    // its type arguments in this copy of the patch; left as found - confirm against the repository.
+    using ITrainerEstimator = ITrainerEstimator, object>;
+
+    /// <summary>
+    /// Trainer extension for the averaged perceptron binary classifier. Falls back to
+    /// <see cref="DefaultNumIterations"/> when the sweep does not supply "NumberOfIterations".
+    /// </summary>
+    internal class AveragedPerceptronBinaryExtension : ITrainerExtension
+    {
+        private const int DefaultNumIterations = 10;
+
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildAveragePerceptronParams();
+        }
+
+        /// <summary>
+        /// Builds the trainer. With no sweep params, default options are used with the label column
+        /// and the default iteration count; otherwise options come from the sweep params and the
+        /// default iteration count is applied only if "NumberOfIterations" is not among them.
+        /// </summary>
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            AveragedPerceptronTrainer.Options options = null;
+            if (sweepParams == null || !sweepParams.Any())
+            {
+                options = new AveragedPerceptronTrainer.Options();
+                options.NumberOfIterations = DefaultNumIterations;
+                options.LabelColumnName = columnInfo.LabelColumnName;
+            }
+            else
+            {
+                options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+                if (!sweepParams.Any(p => p.Name == "NumberOfIterations"))
+                {
+                    options.NumberOfIterations = DefaultNumIterations;
+                }
+            }
+            return mlContext.BinaryClassification.Trainers.AveragedPerceptron(options);
+        }
+
+        /// <summary>
+        /// Builds the pipeline node for this trainer, adding the default "NumberOfIterations"
+        /// as an additional property when the sweep params do not provide one.
+        /// </summary>
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            Dictionary additionalProperties = null;
+
+            // Mirror CreateInstance: record the default iteration count only when no sweep
+            // param is named "NumberOfIterations". The previous "!=" comparison skipped the
+            // default whenever any other hyperparameter was present.
+            if (sweepParams == null || !sweepParams.Any(p => p.Name == "NumberOfIterations"))
+            {
+                additionalProperties = new Dictionary()
+                {
+                    { "NumberOfIterations", DefaultNumIterations }
+                };
+            }
+
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, additionalProperties: additionalProperties);
+        }
+    }
+
+    /// <summary>Trainer extension for the FastForest binary classifier; forwards the example-weight column.</summary>
+    internal class FastForestBinaryExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildFastForestParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.BinaryClassification.Trainers.FastForest(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for the FastTree binary classifier; forwards the example-weight column.</summary>
+    internal class FastTreeBinaryExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildFastTreeParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.BinaryClassification.Trainers.FastTree(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>
+    /// Trainer extension for the LightGBM binary classifier; uses the LightGBM-specific
+    /// option and pipeline-node builders in TrainerExtensionUtil.
+    /// </summary>
+    internal class LightGbmBinaryExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildLightGbmParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            LightGbmBinaryTrainer.Options options = TrainerExtensionUtil.CreateLightGbmOptions>, CalibratedModelParametersBase>(sweepParams, columnInfo);
+            return mlContext.BinaryClassification.Trainers.LightGbm(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildLightGbmPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for the linear SVM binary classifier; no example-weight column is set.</summary>
+    internal class LinearSvmBinaryExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildLinearSvmParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            return mlContext.BinaryClassification.Trainers.LinearSvm(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for SDCA logistic regression (binary); no example-weight column is set.</summary>
+    internal class SdcaLogisticRegressionBinaryExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildSdcaParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            return mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for L-BFGS logistic regression (binary); forwards the example-weight column.</summary>
+    internal class LbfgsLogisticRegressionBinaryExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildLbfgsLogisticRegressionParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for the calibrated SGD binary classifier; forwards the example-weight column.</summary>
+    internal class SgdCalibratedBinaryExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildSgdParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.BinaryClassification.Trainers.SgdCalibrated(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for symbolic SGD logistic regression (binary); no example-weight column is set.</summary>
+    internal class SymbolicSgdLogisticRegressionBinaryExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildSymSgdLogisticRegressionParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            return mlContext.BinaryClassification.Trainers.SymbolicSgdLogisticRegression(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/TrainerExtensions/ITrainerExtension.cs b/src/Microsoft.ML.AutoML/TrainerExtensions/ITrainerExtension.cs
new file mode 100644
index 0000000000..40a6c9e1f7
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TrainerExtensions/ITrainerExtension.cs
@@ -0,0 +1,20 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using Microsoft.ML.Trainers;
+
+namespace Microsoft.ML.AutoML
+{
+    using ITrainerEstimator = ITrainerEstimator, object>;
+
+    /// <summary>
+    /// Contract shared by the trainer extensions: expose hyperparameter sweep ranges,
+    /// build a trainer estimator, and build the matching pipeline node.
+    /// </summary>
+    internal interface ITrainerExtension
+    {
+        IEnumerable GetHyperparamSweepRanges();
+
+        ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams, ColumnInformation columnInfo);
+
+        PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo);
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/TrainerExtensions/MultiTrainerExtensions.cs b/src/Microsoft.ML.AutoML/TrainerExtensions/MultiTrainerExtensions.cs
new file mode 100644
index 0000000000..543cb5b147
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TrainerExtensions/MultiTrainerExtensions.cs
@@ -0,0 +1,232 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Trainers.FastTree;
+using Microsoft.ML.Trainers.LightGbm;
+
+namespace Microsoft.ML.AutoML
+{
+    // NOTE(review): this alias (and the other generic uses in this file) appears to have lost
+    // its type arguments in this copy of the patch; left as found - confirm against the repository.
+    using ITrainerEstimator = ITrainerEstimator, object>;
+
+    /// <summary>
+    /// One-versus-all multiclass extension built on AveragedPerceptronBinaryExtension:
+    /// the binary extension creates the trainer, which is then passed to OneVersusAll.
+    /// </summary>
+    internal class AveragedPerceptronOvaExtension : ITrainerExtension
+    {
+        private static readonly ITrainerExtension _binaryLearnerCatalogItem = new AveragedPerceptronBinaryExtension();
+
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildAveragePerceptronParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            // "as" yields null if the binary extension returns a different estimator type;
+            // there is no null check before the value is handed to OneVersusAll.
+            var binaryTrainer = _binaryLearnerCatalogItem.CreateInstance(mlContext, sweepParams, columnInfo) as AveragedPerceptronTrainer;
+            return mlContext.MulticlassClassification.Trainers.OneVersusAll(binaryTrainer, labelColumnName: columnInfo.LabelColumnName);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildOvaPipelineNode(this, _binaryLearnerCatalogItem, sweepParams, columnInfo);
+        }
+    }
+
+    /// <summary>One-versus-all multiclass extension built on FastForestBinaryExtension.</summary>
+    internal class FastForestOvaExtension : ITrainerExtension
+    {
+        private static readonly ITrainerExtension _binaryLearnerCatalogItem = new FastForestBinaryExtension();
+
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildFastForestParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var binaryTrainer = _binaryLearnerCatalogItem.CreateInstance(mlContext, sweepParams, columnInfo) as FastForestBinaryTrainer;
+            return mlContext.MulticlassClassification.Trainers.OneVersusAll(binaryTrainer, labelColumnName: columnInfo.LabelColumnName);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildOvaPipelineNode(this, _binaryLearnerCatalogItem, sweepParams, columnInfo);
+        }
+    }
+
+    /// <summary>
+    /// Trainer extension for the LightGBM multiclass trainer; sweeps the multiclass LightGBM
+    /// ranges and uses the LightGBM-specific builders in TrainerExtensionUtil.
+    /// </summary>
+    internal class LightGbmMultiExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildLightGbmParamsMulticlass();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            LightGbmMulticlassTrainer.Options options = TrainerExtensionUtil.CreateLightGbmOptions, MulticlassPredictionTransformer, OneVersusAllModelParameters>(sweepParams, columnInfo);
+            return mlContext.MulticlassClassification.Trainers.LightGbm(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildLightGbmPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>One-versus-all multiclass extension built on LinearSvmBinaryExtension.</summary>
+    internal class LinearSvmOvaExtension : ITrainerExtension
+    {
+        private static readonly ITrainerExtension _binaryLearnerCatalogItem = new LinearSvmBinaryExtension();
+
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildLinearSvmParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var binaryTrainer = _binaryLearnerCatalogItem.CreateInstance(mlContext, sweepParams, columnInfo) as LinearSvmTrainer;
+            return mlContext.MulticlassClassification.Trainers.OneVersusAll(binaryTrainer, labelColumnName: columnInfo.LabelColumnName);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildOvaPipelineNode(this, _binaryLearnerCatalogItem, sweepParams, columnInfo);
+        }
+    }
+
+    /// <summary>Trainer extension for SDCA maximum entropy (multiclass); no example-weight column is set.</summary>
+    internal class SdcaMaximumEntropyMultiExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildSdcaParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            return mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName);
+        }
+    }
+
+    /// <summary>One-versus-all multiclass extension built on LbfgsLogisticRegressionBinaryExtension.</summary>
+    internal class LbfgsLogisticRegressionOvaExtension : ITrainerExtension
+    {
+        private static readonly ITrainerExtension _binaryLearnerCatalogItem = new LbfgsLogisticRegressionBinaryExtension();
+
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildLbfgsLogisticRegressionParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var binaryTrainer = _binaryLearnerCatalogItem.CreateInstance(mlContext, sweepParams, columnInfo) as LbfgsLogisticRegressionBinaryTrainer;
+            return mlContext.MulticlassClassification.Trainers.OneVersusAll(binaryTrainer, labelColumnName: columnInfo.LabelColumnName);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildOvaPipelineNode(this, _binaryLearnerCatalogItem, sweepParams, columnInfo);
+        }
+    }
+
+    /// <summary>One-versus-all multiclass extension built on SgdCalibratedBinaryExtension.</summary>
+    internal class SgdCalibratedOvaExtension : ITrainerExtension
+    {
+        private static readonly ITrainerExtension _binaryLearnerCatalogItem = new SgdCalibratedBinaryExtension();
+
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildSgdParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var binaryTrainer = _binaryLearnerCatalogItem.CreateInstance(mlContext, sweepParams, columnInfo) as SgdCalibratedTrainer;
+            return mlContext.MulticlassClassification.Trainers.OneVersusAll(binaryTrainer, labelColumnName: columnInfo.LabelColumnName);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildOvaPipelineNode(this, _binaryLearnerCatalogItem, sweepParams, columnInfo);
+        }
+    }
+
+    /// <summary>One-versus-all multiclass extension built on SymbolicSgdLogisticRegressionBinaryExtension.</summary>
+    internal class SymbolicSgdLogisticRegressionOvaExtension : ITrainerExtension
+    {
+        private static readonly ITrainerExtension _binaryLearnerCatalogItem = new SymbolicSgdLogisticRegressionBinaryExtension();
+
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            // Unlike the other one-versus-all extensions in this file, this one asks the
+            // binary extension for its ranges instead of calling SweepableParams directly.
+            return _binaryLearnerCatalogItem.GetHyperparamSweepRanges();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var binaryTrainer = _binaryLearnerCatalogItem.CreateInstance(mlContext, sweepParams, columnInfo) as SymbolicSgdLogisticRegressionBinaryTrainer;
+            return mlContext.MulticlassClassification.Trainers.OneVersusAll(binaryTrainer, labelColumnName: columnInfo.LabelColumnName);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildOvaPipelineNode(this, _binaryLearnerCatalogItem, sweepParams, columnInfo);
+        }
+    }
+
+    /// <summary>One-versus-all multiclass extension built on FastTreeBinaryExtension.</summary>
+    internal class FastTreeOvaExtension : ITrainerExtension
+    {
+        private static readonly ITrainerExtension _binaryLearnerCatalogItem = new FastTreeBinaryExtension();
+
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildFastTreeParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var binaryTrainer = _binaryLearnerCatalogItem.CreateInstance(mlContext, sweepParams, columnInfo) as FastTreeBinaryTrainer;
+            return mlContext.MulticlassClassification.Trainers.OneVersusAll(binaryTrainer, labelColumnName: columnInfo.LabelColumnName);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildOvaPipelineNode(this, _binaryLearnerCatalogItem, sweepParams, columnInfo);
+        }
+    }
+
+    /// <summary>
+    /// Trainer extension for L-BFGS maximum entropy (multiclass); sweeps the L-BFGS
+    /// logistic-regression ranges and forwards the example-weight column.
+    /// </summary>
+    internal class LbfgsMaximumEntropyMultiExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildLbfgsLogisticRegressionParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/TrainerExtensions/RegressionTrainerExtensions.cs b/src/Microsoft.ML.AutoML/TrainerExtensions/RegressionTrainerExtensions.cs
new file mode 100644
index 0000000000..795aa23ac1
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TrainerExtensions/RegressionTrainerExtensions.cs
@@ -0,0 +1,187 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Trainers.FastTree;
+using Microsoft.ML.Trainers.LightGbm;
+
+namespace Microsoft.ML.AutoML
+{
+    // NOTE(review): this alias (and the other generic uses in this file) appears to have lost
+    // its type arguments in this copy of the patch; left as found - confirm against the repository.
+    using ITrainerEstimator = ITrainerEstimator, object>;
+
+    /// <summary>Trainer extension for FastForest regression; forwards the example-weight column.</summary>
+    internal class FastForestRegressionExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildFastForestParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.Regression.Trainers.FastForest(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for FastTree regression; forwards the example-weight column.</summary>
+    internal class FastTreeRegressionExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildFastTreeParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.Regression.Trainers.FastTree(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for FastTree Tweedie regression; forwards the example-weight column.</summary>
+    internal class FastTreeTweedieRegressionExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildFastTreeTweedieParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.Regression.Trainers.FastTreeTweedie(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>
+    /// Trainer extension for LightGBM regression; uses the LightGBM-specific option and
+    /// pipeline-node builders in TrainerExtensionUtil.
+    /// </summary>
+    internal class LightGbmRegressionExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildLightGbmParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            LightGbmRegressionTrainer.Options options = TrainerExtensionUtil.CreateLightGbmOptions, LightGbmRegressionModelParameters>(sweepParams, columnInfo);
+            return mlContext.Regression.Trainers.LightGbm(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildLightGbmPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for online gradient descent regression; no example-weight column is set.</summary>
+    internal class OnlineGradientDescentRegressionExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildOnlineGradientDescentParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            return mlContext.Regression.Trainers.OnlineGradientDescent(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for OLS regression; forwards the example-weight column.</summary>
+    internal class OlsRegressionExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildOlsParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.Regression.Trainers.Ols(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for L-BFGS Poisson regression; forwards the example-weight column.</summary>
+    internal class LbfgsPoissonRegressionExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildLbfgsPoissonRegressionParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            return mlContext.Regression.Trainers.LbfgsPoissonRegression(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName, columnInfo.ExampleWeightColumnName);
+        }
+    }
+
+    /// <summary>Trainer extension for SDCA regression; no example-weight column is set.</summary>
+    internal class SdcaRegressionExtension : ITrainerExtension
+    {
+        public IEnumerable GetHyperparamSweepRanges()
+        {
+            return SweepableParams.BuildSdcaParams();
+        }
+
+        public ITrainerEstimator CreateInstance(MLContext mlContext, IEnumerable sweepParams,
+            ColumnInformation columnInfo)
+        {
+            var options = TrainerExtensionUtil.CreateOptions(sweepParams, columnInfo.LabelColumnName);
+            return mlContext.Regression.Trainers.Sdca(options);
+        }
+
+        public PipelineNode CreatePipelineNode(IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            return TrainerExtensionUtil.BuildPipelineNode(TrainerExtensionCatalog.GetTrainerName(this), sweepParams,
+                columnInfo.LabelColumnName);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/TrainerExtensions/SweepableParams.cs b/src/Microsoft.ML.AutoML/TrainerExtensions/SweepableParams.cs
new file mode 100644
index 0000000000..5f23729a77
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TrainerExtensions/SweepableParams.cs
@@ -0,0 +1,205 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Static catalog of hyperparameter sweep ranges. Private builders hold ranges shared by
+    /// several trainers; public builders compose them per trainer.
+    /// </summary>
+    // NOTE(review): generic type arguments (on IEnumerable, List, ISet, HashSet) appear to have
+    // been lost in this copy of the patch; left as found - confirm against the repository.
+    internal static class SweepableParams
+    {
+        // Ranges used by the averaged perceptron and online gradient descent builders.
+        private static IEnumerable BuildAveragedLinearArgsParams()
+        {
+            return new SweepableParam[]
+            {
+                new SweepableDiscreteParam("LearningRate", new object[] { 0.01f, 0.1f, 0.5f, 1.0f}),
+                new SweepableDiscreteParam("DecreaseLearningRate", new object[] { false, true }),
+                new SweepableFloatParam("L2Regularization", 0.0f, 0.4f),
+            };
+        }
+
+        // Ranges used by the averaged perceptron and linear SVM builders.
+        private static IEnumerable BuildOnlineLinearArgsParams()
+        {
+            return new SweepableParam[]
+            {
+                new SweepableLongParam("NumberOfIterations", 1, 100, stepSize: 10, isLogScale: true),
+                new SweepableFloatParam("InitialWeightsDiameter", 0.0f, 1.0f, numSteps: 5),
+                new SweepableDiscreteParam("Shuffle", new object[] { false, true }),
+            };
+        }
+
+        // Tree-shape ranges; used directly by FastForest and extended for boosted trees.
+        private static IEnumerable BuildTreeArgsParams()
+        {
+            return new SweepableParam[]
+            {
+                new SweepableLongParam("NumberOfLeaves", 2, 128, isLogScale: true, stepSize: 4),
+                new SweepableDiscreteParam("MinimumExampleCountPerLeaf", new object[] { 1, 10, 50 }),
+                new SweepableDiscreteParam("NumberOfTrees", new object[] { 20, 100, 500 }),
+            };
+        }
+
+        // Tree ranges plus LearningRate and Shrinkage; used by FastTree and FastTreeTweedie.
+        private static IEnumerable BuildBoostedTreeArgsParams()
+        {
+            return BuildTreeArgsParams().Concat(new List()
+            {
+                new SweepableFloatParam("LearningRate", 0.025f, 0.4f, isLogScale: true),
+                new SweepableFloatParam("Shrinkage", 0.025f, 4f, isLogScale: true),
+            });
+        }
+
+        // Ranges used by the L-BFGS logistic regression and Poisson regression builders.
+        private static IEnumerable BuildLbfgsArgsParams()
+        {
+            return new SweepableParam[] {
+                new SweepableFloatParam("L2Regularization", 0.0f, 1.0f, numSteps: 4),
+                new SweepableFloatParam("L1Regularization", 0.0f, 1.0f, numSteps: 4),
+                new SweepableDiscreteParam("OptimizationTolerance", new object[] { 1e-4f, 1e-7f }),
+                new SweepableDiscreteParam("HistorySize", new object[] { 5, 20, 50 }),
+                new SweepableLongParam("MaximumNumberOfIterations", 1, int.MaxValue),
+                new SweepableFloatParam("InitialWeightsDiameter", 0.0f, 1.0f, numSteps: 5),
+                new SweepableDiscreteParam("DenseOptimizer", new object[] { false, true }),
+            };
+        }
+
+        /// <summary>
+        /// The names of every hyperparameter swept across all trainers.
+        /// </summary>
+        public static ISet AllHyperparameterNames = GetAllSweepableParameterNames();
+
+        public static IEnumerable BuildAveragePerceptronParams()
+        {
+            return BuildAveragedLinearArgsParams().Concat(BuildOnlineLinearArgsParams());
+        }
+
+        public static IEnumerable BuildFastForestParams()
+        {
+            return BuildTreeArgsParams();
+        }
+
+        public static IEnumerable BuildFastTreeParams()
+        {
+            return BuildBoostedTreeArgsParams();
+        }
+
+        public static IEnumerable BuildFastTreeTweedieParams()
+        {
+            return BuildBoostedTreeArgsParams();
+        }
+
+        // The binary/regression LightGBM ranges plus the multiclass-only "UseSoftmax" switch.
+        public static IEnumerable BuildLightGbmParamsMulticlass()
+        {
+            return BuildLightGbmParams().Union(new SweepableParam[]
+            {
+                new SweepableDiscreteParam("UseSoftmax", new object[] { true, false }),
+            });
+        }
+
+        public static IEnumerable BuildLightGbmParams()
+        {
+            return new SweepableParam[]
+            {
+                new SweepableDiscreteParam("NumberOfIterations", new object[] { 10, 20, 50, 100, 150, 200 }),
+                new SweepableFloatParam("LearningRate", 0.025f, 0.4f, isLogScale: true),
+                new SweepableLongParam("NumberOfLeaves", 2, 128, isLogScale: true, stepSize: 4),
+                new SweepableDiscreteParam("MinimumExampleCountPerLeaf", new object[] { 1, 10, 20, 50 }),
+                new SweepableDiscreteParam("UseCategoricalSplit", new object[] { true, false }),
+                new SweepableDiscreteParam("HandleMissingValue", new object[] { true, false }),
+                new SweepableDiscreteParam("MinimumExampleCountPerGroup", new object[] { 10, 50, 100, 200 }),
+                new SweepableDiscreteParam("MaximumCategoricalSplitPointCount", new object[] { 8, 16, 32, 64 }),
+                new SweepableDiscreteParam("CategoricalSmoothing", new object[] { 1, 10, 20 }),
+                new SweepableDiscreteParam("L2CategoricalRegularization", new object[] { 0.1, 0.5, 1, 5, 10 }),
+
+                // Booster params
+                new SweepableDiscreteParam("L2Regularization", new object[] { 0f, 0.5f, 1f }),
+                new SweepableDiscreteParam("L1Regularization", new object[] { 0f, 0.5f, 1f })
+            };
+        }
+
+        public static IEnumerable BuildLinearSvmParams()
+        {
+            return new SweepableParam[] {
+                new SweepableFloatParam("Lambda", 0.00001f, 0.1f, 10, isLogScale: true),
+                new SweepableDiscreteParam("PerformProjection", null, isBool: true),
+                new SweepableDiscreteParam("NoBias", null, isBool: true)
+            }.Concat(BuildOnlineLinearArgsParams());
+        }
+
+        public static IEnumerable BuildLbfgsLogisticRegressionParams()
+        {
+            return BuildLbfgsArgsParams();
+        }
+
+        public static IEnumerable BuildOnlineGradientDescentParams()
+        {
+            return BuildAveragedLinearArgsParams();
+        }
+
+        public static IEnumerable BuildLbfgsPoissonRegressionParams()
+        {
+            return BuildLbfgsArgsParams();
+        }
+
+        public static IEnumerable BuildSdcaParams()
+        {
+            // NOTE(review): the "" entries presumably stand for "leave the trainer default";
+            // verify against how SweepableDiscreteParam values are applied.
+            return new SweepableParam[] {
+                new SweepableDiscreteParam("L2Regularization", new object[] { "", 1e-7f, 1e-6f, 1e-5f, 1e-4f, 1e-3f, 1e-2f }),
+                new SweepableDiscreteParam("L1Regularization", new object[] { "", 0f, 0.25f, 0.5f, 0.75f, 1f }),
+                new SweepableDiscreteParam("ConvergenceTolerance", new object[] { 0.001f, 0.01f, 0.1f, 0.2f }),
+                new SweepableDiscreteParam("MaximumNumberOfIterations", new object[] { "", 10, 20, 100 }),
+                new SweepableDiscreteParam("Shuffle", null, isBool: true),
+                new SweepableDiscreteParam("BiasLearningRate", new object[] { 0.0f, 0.01f, 0.1f, 1f })
+            };
+        }
+
+        public static IEnumerable BuildOlsParams()
+        {
+            return new SweepableParam[] {
+                new SweepableDiscreteParam("L2Regularization", new object[] { 1e-6f, 0.1f, 1f })
+            };
+        }
+
+        public static IEnumerable BuildSgdParams()
+        {
+            return new SweepableParam[] {
+                new SweepableDiscreteParam("L2Regularization", new object[] { 1e-7f, 5e-7f, 1e-6f, 5e-6f, 1e-5f }),
+                new SweepableDiscreteParam("ConvergenceTolerance", new object[] { 1e-2f, 1e-3f, 1e-4f, 1e-5f }),
+                new SweepableDiscreteParam("NumberOfIterations", new object[] { 1, 5, 10, 20 }),
+                new SweepableDiscreteParam("Shuffle", null, isBool: true),
+            };
+        }
+
+        public static IEnumerable BuildSymSgdLogisticRegressionParams()
+        {
+            return new SweepableParam[] {
+                new SweepableDiscreteParam("NumberOfIterations", new object[] { 1, 5, 10, 20, 30, 40, 50 }),
+                new SweepableDiscreteParam("LearningRate", new object[] { "", 1e1f, 1e0f, 1e-1f, 1e-2f, 1e-3f }),
+                // NOTE(review): 1e-5f is listed twice; possibly a typo for another value - left unchanged, confirm intent.
+                new SweepableDiscreteParam("L2Regularization", new object[] { 0.0f, 1e-5f, 1e-5f, 1e-6f, 1e-7f }),
+                new SweepableDiscreteParam("UpdateFrequency", new object[] { "", 5, 20 })
+            };
+        }
+
+        /// <summary>
+        /// Gets the name of every hyperparameter swept across all trainers.
+        /// </summary>
+        public static ISet GetAllSweepableParameterNames()
+        {
+            // Each public builder is visited once; the result is a set, so names shared
+            // between builders collapse to a single entry.
+            var sweepableParams = new List();
+            sweepableParams.AddRange(BuildAveragePerceptronParams());
+            sweepableParams.AddRange(BuildFastForestParams());
+            sweepableParams.AddRange(BuildFastTreeParams());
+            sweepableParams.AddRange(BuildFastTreeTweedieParams());
+            sweepableParams.AddRange(BuildLightGbmParamsMulticlass());
+            sweepableParams.AddRange(BuildLightGbmParams());
+            sweepableParams.AddRange(BuildLinearSvmParams());
+            sweepableParams.AddRange(BuildLbfgsLogisticRegressionParams());
+            sweepableParams.AddRange(BuildOnlineGradientDescentParams());
+            sweepableParams.AddRange(BuildLbfgsPoissonRegressionParams());
+            sweepableParams.AddRange(BuildSdcaParams());
+            sweepableParams.AddRange(BuildOlsParams());
+            sweepableParams.AddRange(BuildSgdParams());
+            sweepableParams.AddRange(BuildSymSgdLogisticRegressionParams());
+            return new HashSet(sweepableParams.Select(p => p.Name));
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/TrainerExtensions/TrainerExtensionCatalog.cs b/src/Microsoft.ML.AutoML/TrainerExtensions/TrainerExtensionCatalog.cs
new file mode 100644
index 0000000000..487112e393
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TrainerExtensions/TrainerExtensionCatalog.cs
@@ -0,0 +1,138 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Lookup between <see cref="TrainerName"/> values and the trainer-extension
+    /// types that implement them, plus the per-task lists of available extensions.
+    /// </summary>
+    internal class TrainerExtensionCatalog
+    {
+        // Forward map: trainer name -> extension type.
+        private static readonly IDictionary _trainerNamesToExtensionTypes =
+            new Dictionary()
+            {
+                { TrainerName.AveragedPerceptronBinary, typeof(AveragedPerceptronBinaryExtension) },
+                { TrainerName.AveragedPerceptronOva, typeof(AveragedPerceptronOvaExtension) },
+                { TrainerName.FastForestBinary, typeof(FastForestBinaryExtension) },
+                { TrainerName.FastForestOva, typeof(FastForestOvaExtension) },
+                { TrainerName.FastForestRegression, typeof(FastForestRegressionExtension) },
+                { TrainerName.FastTreeBinary, typeof(FastTreeBinaryExtension) },
+                { TrainerName.FastTreeOva, typeof(FastTreeOvaExtension) },
+                { TrainerName.FastTreeRegression, typeof(FastTreeRegressionExtension) },
+                { TrainerName.FastTreeTweedieRegression, typeof(FastTreeTweedieRegressionExtension) },
+                { TrainerName.LightGbmBinary, typeof(LightGbmBinaryExtension) },
+                { TrainerName.LightGbmMulti, typeof(LightGbmMultiExtension) },
+                { TrainerName.LightGbmRegression, typeof(LightGbmRegressionExtension) },
+                { TrainerName.LinearSvmBinary, typeof(LinearSvmBinaryExtension) },
+                { TrainerName.LinearSvmOva, typeof(LinearSvmOvaExtension) },
+                { TrainerName.LbfgsLogisticRegressionBinary, typeof(LbfgsLogisticRegressionBinaryExtension) },
+                { TrainerName.LbfgsMaximumEntropyMulti, typeof(LbfgsMaximumEntropyMultiExtension) },
+                { TrainerName.LbfgsLogisticRegressionOva, typeof(LbfgsLogisticRegressionOvaExtension) },
+                { TrainerName.OnlineGradientDescentRegression, typeof(OnlineGradientDescentRegressionExtension) },
+                { TrainerName.OlsRegression, typeof(OlsRegressionExtension) },
+                { TrainerName.LbfgsPoissonRegression, typeof(LbfgsPoissonRegressionExtension) },
+                { TrainerName.SdcaLogisticRegressionBinary, typeof(SdcaLogisticRegressionBinaryExtension) },
+                { TrainerName.SdcaMaximumEntropyMulti, typeof(SdcaMaximumEntropyMultiExtension) },
+                { TrainerName.SdcaRegression, typeof(SdcaRegressionExtension) },
+                { TrainerName.SgdCalibratedBinary, typeof(SgdCalibratedBinaryExtension) },
+                { TrainerName.SgdCalibratedOva, typeof(SgdCalibratedOvaExtension) },
+                { TrainerName.SymbolicSgdLogisticRegressionBinary, typeof(SymbolicSgdLogisticRegressionBinaryExtension) },
+                { TrainerName.SymbolicSgdLogisticRegressionOva, typeof(SymbolicSgdLogisticRegressionOvaExtension) }
+            };
+
+        // Reverse map, derived from the forward map above (so it must be declared after it).
+        private static readonly IDictionary _extensionTypesToTrainerNames =
+            _trainerNamesToExtensionTypes.ToDictionary(kvp => kvp.Value, kvp => kvp.Key);
+
+        /// <summary>Returns the trainer name registered for the extension's runtime type.</summary>
+        public static TrainerName GetTrainerName(ITrainerExtension trainerExtension)
+            => _extensionTypesToTrainerNames[trainerExtension.GetType()];
+
+        /// <summary>Creates a new extension instance for the given trainer name.</summary>
+        public static ITrainerExtension GetTrainerExtension(TrainerName trainerName)
+        {
+            return (ITrainerExtension)Activator.CreateInstance(_trainerNamesToExtensionTypes[trainerName]);
+        }
+
+        /// <summary>
+        /// Returns the extensions available for <paramref name="task"/>, restricted to
+        /// <paramref name="whitelist"/> when it is not null.
+        /// </summary>
+        /// <exception cref="NotSupportedException">The task kind has no trainer list.</exception>
+        public static IEnumerable GetTrainers(TaskKind task,
+            IEnumerable whitelist)
+        {
+            IEnumerable candidates;
+            switch (task)
+            {
+                case TaskKind.BinaryClassification:
+                    candidates = GetBinaryLearners();
+                    break;
+                case TaskKind.MulticlassClassification:
+                    candidates = GetMultiLearners();
+                    break;
+                case TaskKind.Regression:
+                    candidates = GetRegressionLearners();
+                    break;
+                default:
+                    // should not be possible to reach here
+                    throw new NotSupportedException($"unsupported machine learning task type {task}");
+            }
+
+            if (whitelist == null)
+            {
+                return candidates;
+            }
+
+            // Copy into a set once so each membership test is a hash lookup.
+            var allowed = new HashSet(whitelist);
+            return candidates.Where(t => allowed.Contains(GetTrainerName(t)));
+        }
+
+        private static IEnumerable GetBinaryLearners() => new ITrainerExtension[]
+        {
+            new AveragedPerceptronBinaryExtension(),
+            new SdcaLogisticRegressionBinaryExtension(),
+            new LightGbmBinaryExtension(),
+            new SymbolicSgdLogisticRegressionBinaryExtension(),
+            new LinearSvmBinaryExtension(),
+            new FastTreeBinaryExtension(),
+            new LbfgsLogisticRegressionBinaryExtension(),
+            new FastForestBinaryExtension(),
+            new SgdCalibratedBinaryExtension()
+        };
+
+        private static IEnumerable GetMultiLearners() => new ITrainerExtension[]
+        {
+            new AveragedPerceptronOvaExtension(),
+            new SdcaMaximumEntropyMultiExtension(),
+            new LightGbmMultiExtension(),
+            new SymbolicSgdLogisticRegressionOvaExtension(),
+            new FastTreeOvaExtension(),
+            new LinearSvmOvaExtension(),
+            new LbfgsLogisticRegressionOvaExtension(),
+            new SgdCalibratedOvaExtension(),
+            new FastForestOvaExtension(),
+            new LbfgsMaximumEntropyMultiExtension()
+        };
+
+        private static IEnumerable GetRegressionLearners() => new ITrainerExtension[]
+        {
+            new SdcaRegressionExtension(),
+            new LightGbmRegressionExtension(),
+            new FastTreeRegressionExtension(),
+            new FastTreeTweedieRegressionExtension(),
+            new FastForestRegressionExtension(),
+            new LbfgsPoissonRegressionExtension(),
+            new OnlineGradientDescentRegressionExtension(),
+            new OlsRegressionExtension(),
+        };
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/TrainerExtensions/TrainerExtensionUtil.cs b/src/Microsoft.ML.AutoML/TrainerExtensions/TrainerExtensionUtil.cs
new file mode 100644
index 0000000000..0d357b20e4
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TrainerExtensions/TrainerExtensionUtil.cs
@@ -0,0 +1,382 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Reflection;
+using Microsoft.ML.Calibrators;
+using Microsoft.ML.Data;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Trainers.LightGbm;
+
+namespace Microsoft.ML.AutoML
+{
+    // Member order fixes the underlying numeric values; append rather than reorder.
+    internal enum TrainerName
+    {
+        AveragedPerceptronBinary,
+        AveragedPerceptronOva,
+        FastForestBinary,
+        FastForestOva,
+        FastForestRegression,
+        FastTreeBinary,
+        FastTreeOva,
+        FastTreeRegression,
+        FastTreeTweedieRegression,
+        LightGbmBinary,
+        LightGbmMulti,
+        LightGbmRegression,
+        LinearSvmBinary,
+        LinearSvmOva,
+        LbfgsLogisticRegressionBinary,
+        LbfgsLogisticRegressionOva,
+        LbfgsMaximumEntropyMulti,
+        OnlineGradientDescentRegression,
+        OlsRegression,
+        Ova,
+        LbfgsPoissonRegression,
+        SdcaLogisticRegressionBinary,
+        SdcaMaximumEntropyMulti,
+        SdcaRegression,
+        SgdCalibratedBinary,
+        SgdCalibratedOva,
+        SymbolicSgdLogisticRegressionBinary,
+        SymbolicSgdLogisticRegressionOva
+    }
+
+    /// <summary>
+    /// Helpers shared by trainer extensions: building trainer options from sweep
+    /// parameters, building pipeline nodes and parameter sets, and mapping the
+    /// public trainer enums to <see cref="TrainerName"/>.
+    /// </summary>
+    internal static class TrainerExtensionUtil
+    {
+        // Property keys under which the column names are stored in a pipeline node.
+        private const string WeightColumn = "ExampleWeightColumnName";
+        private const string LabelColumn = "LabelColumnName";
+
+        /// <summary>
+        /// Creates an options object, sets its label column, and applies
+        /// <paramref name="sweepParams"/> (when not null) via <see cref="UpdateFields"/>.
+        /// </summary>
+        public static T CreateOptions(IEnumerable sweepParams, string labelColumn) where T : TrainerInputBaseWithLabel
+        {
+            var options = Activator.CreateInstance();
+            options.LabelColumnName = labelColumn;
+            if (sweepParams != null)
+            {
+                UpdateFields(options, sweepParams);
+            }
+            return options;
+        }
+
+        // Sweep parameters with these names are applied to options.Booster instead of the options object itself.
+        private static string[] _lightGbmBoosterParamNames = new[] { "L2Regularization", "L1Regularization" };
+        private const string LightGbmBoosterPropName = "Booster";
+
+        /// <summary>
+        /// Creates LightGBM options with a fresh GradientBooster.Options booster. Booster
+        /// parameters (see _lightGbmBoosterParamNames) are applied to the booster, all
+        /// others to the options object.
+        /// </summary>
+        public static TOptions CreateLightGbmOptions(IEnumerable sweepParams, ColumnInformation columnInfo)
+            where TOptions : LightGbmTrainerBase.OptionsBase, new()
+            where TTransformer : ISingleFeaturePredictionTransformer
+            where TModel : class
+        {
+            var options = new TOptions();
+            options.LabelColumnName = columnInfo.LabelColumnName;
+            options.ExampleWeightColumnName = columnInfo.ExampleWeightColumnName;
+            options.Booster = new GradientBooster.Options();
+            if (sweepParams != null)
+            {
+                var boosterParams = sweepParams.Where(p => _lightGbmBoosterParamNames.Contains(p.Name));
+                var parentArgParams = sweepParams.Except(boosterParams);
+                UpdateFields(options, parentArgParams);
+                UpdateFields(options.Booster, boosterParams);
+            }
+            return options;
+        }
+
+        /// <summary>
+        /// Builds an "Ova" trainer node whose "BinaryTrainer" property holds the node
+        /// created by <paramref name="binaryExtension"/>. <paramref name="multiExtension"/>
+        /// is not used by this method.
+        /// </summary>
+        public static PipelineNode BuildOvaPipelineNode(ITrainerExtension multiExtension, ITrainerExtension binaryExtension,
+            IEnumerable sweepParams, ColumnInformation columnInfo)
+        {
+            var ovaNode = new PipelineNode()
+            {
+                Name = TrainerName.Ova.ToString(),
+                NodeType = PipelineNodeType.Trainer,
+                Properties = new Dictionary()
+                {
+                    { LabelColumn, columnInfo.LabelColumnName }
+                }
+            };
+            var binaryNode = binaryExtension.CreatePipelineNode(sweepParams, columnInfo);
+            ovaNode.Properties["BinaryTrainer"] = binaryNode;
+            return ovaNode;
+        }
+
+        /// <summary>
+        /// Builds a trainer node from sweep parameters and column names.
+        /// Entries in <paramref name="additionalProperties"/> overwrite same-named properties.
+        /// </summary>
+        public static PipelineNode BuildPipelineNode(TrainerName trainerName, IEnumerable sweepParams,
+            string labelColumn, string weightColumn = null, IDictionary additionalProperties = null)
+        {
+            var properties = BuildBasePipelineNodeProps(sweepParams, labelColumn, weightColumn);
+
+            if (additionalProperties != null)
+            {
+                foreach (var property in additionalProperties)
+                {
+                    properties[property.Key] = property.Value;
+                }
+            }
+
+            return new PipelineNode(trainerName.ToString(), PipelineNodeType.Trainer, DefaultColumnNames.Features,
+                DefaultColumnNames.Score, properties);
+        }
+
+        /// <summary>Builds a LightGBM trainer node; booster parameters are nested under "Booster".</summary>
+        public static PipelineNode BuildLightGbmPipelineNode(TrainerName trainerName, IEnumerable sweepParams,
+            string labelColumn, string weightColumn)
+        {
+            return new PipelineNode(trainerName.ToString(), PipelineNodeType.Trainer, DefaultColumnNames.Features,
+                DefaultColumnNames.Score, BuildLightGbmPipelineNodeProps(sweepParams, labelColumn, weightColumn));
+        }
+
+        // Flat property bag: one entry per sweep parameter, plus label and (optional) weight column.
+        private static IDictionary BuildBasePipelineNodeProps(IEnumerable sweepParams,
+            string labelColumn, string weightColumn)
+        {
+            var props = new Dictionary();
+            if (sweepParams != null)
+            {
+                foreach (var sweepParam in sweepParams)
+                {
+                    props[sweepParam.Name] = sweepParam.ProcessedValue();
+                }
+            }
+            props[LabelColumn] = labelColumn;
+            if (weightColumn != null)
+            {
+                props[WeightColumn] = weightColumn;
+            }
+            return props;
+        }
+
+        // Like BuildBasePipelineNodeProps, but booster parameters are wrapped in a
+        // CustomProperty stored under "Booster". With no sweep params no "Booster" entry is added.
+        private static IDictionary BuildLightGbmPipelineNodeProps(IEnumerable sweepParams,
+            string labelColumn, string weightColumn)
+        {
+            Dictionary props = null;
+            if (sweepParams == null || !sweepParams.Any())
+            {
+                props = new Dictionary();
+            }
+            else
+            {
+                var boosterParams = sweepParams.Where(p => _lightGbmBoosterParamNames.Contains(p.Name));
+                var parentArgParams = sweepParams.Except(boosterParams);
+
+                var boosterProps = boosterParams.ToDictionary(p => p.Name, p => (object)p.ProcessedValue());
+                var boosterCustomProp = new CustomProperty("GradientBooster.Options", boosterProps);
+
+                props = parentArgParams.ToDictionary(p => p.Name, p => (object)p.ProcessedValue());
+                props[LightGbmBoosterPropName] = boosterCustomProp;
+            }
+
+            props[LabelColumn] = labelColumn;
+            if (weightColumn != null)
+            {
+                props[WeightColumn] = weightColumn;
+            }
+
+            return props;
+        }
+
+        /// <summary>
+        /// Converts node properties back into a parameter set, dropping the label and
+        /// weight column entries. LightGBM trainers get their nested booster properties flattened.
+        /// </summary>
+        public static ParameterSet BuildParameterSet(TrainerName trainerName, IDictionary props)
+        {
+            props = props.Where(p => p.Key != LabelColumn && p.Key != WeightColumn)
+                .ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
+
+            if (trainerName == TrainerName.LightGbmBinary || trainerName == TrainerName.LightGbmMulti ||
+                trainerName == TrainerName.LightGbmRegression)
+            {
+                return BuildLightGbmParameterSet(props);
+            }
+
+            var paramVals = props.Select(p => new StringParameterValue(p.Key, p.Value.ToString()));
+            return new ParameterSet(paramVals);
+        }
+
+        /// <summary>
+        /// Reads the label column (required key) and weight column (optional key) from node properties.
+        /// </summary>
+        public static ColumnInformation BuildColumnInfo(IDictionary props)
+        {
+            var columnInfo = new ColumnInformation();
+
+            columnInfo.LabelColumnName = props[LabelColumn] as string;
+
+            props.TryGetValue(WeightColumn, out var weightColumn);
+            columnInfo.ExampleWeightColumnName = weightColumn as string;
+
+            return columnInfo;
+        }
+
+        // Flattens the "Booster" CustomProperty into the same parameter set as the parent properties.
+        // The indexer throws if props is non-empty but has no "Booster" entry.
+        private static ParameterSet BuildLightGbmParameterSet(IDictionary props)
+        {
+            IEnumerable parameters;
+            if (props == null || !props.Any())
+            {
+                parameters = new List();
+            }
+            else
+            {
+                var parentProps = props.Where(p => p.Key != LightGbmBoosterPropName);
+                var treeProps = ((CustomProperty)props[LightGbmBoosterPropName]).Properties;
+                var allProps = parentProps.Union(treeProps);
+                parameters = allProps.Select(p => new StringParameterValue(p.Key, p.Value.ToString()));
+            }
+            return new ParameterSet(parameters);
+        }
+
+        // Assigns value to the field when the types match exactly, or for the three supported
+        // widening/narrowing pairs (float->double, long->int, int->long). Any other
+        // combination leaves the field untouched.
+        private static void SetValue(FieldInfo fi, IComparable value, object obj, Type propertyType)
+        {
+            if (propertyType == value?.GetType())
+                fi.SetValue(obj, value);
+            else if (propertyType == typeof(double) && value is float)
+                fi.SetValue(obj, Convert.ToDouble(value));
+            else if (propertyType == typeof(int) && value is long)
+                fi.SetValue(obj, Convert.ToInt32(value));
+            else if (propertyType == typeof(long) && value is int)
+                fi.SetValue(obj, Convert.ToInt64(value));
+        }
+
+        /// <summary>
+        /// Updates public fields of <paramref name="obj"/> from the values in
+        /// <paramref name="sweepParams"/>; each parameter's name selects the field.
+        /// Parameters whose raw value is null are skipped.
+        /// </summary>
+        /// <exception cref="InvalidOperationException">
+        /// Setting a parameter failed (for example, no field with that name exists).
+        /// The original failure is available as the inner exception.
+        /// </exception>
+        public static void UpdateFields(object obj, IEnumerable sweepParams)
+        {
+            foreach (var param in sweepParams)
+            {
+                try
+                {
+                    // Only updates property if param.value isn't null and
+                    // param has a name of property.
+                    if (param.RawValue == null)
+                    {
+                        continue;
+                    }
+                    var fi = obj.GetType().GetField(param.Name);
+                    var propType = Nullable.GetUnderlyingType(fi.FieldType) ?? fi.FieldType;
+
+                    if (param is SweepableDiscreteParam dp)
+                    {
+                        // For discrete params the raw value is an index into Options.
+                        var optIndex = (int)dp.RawValue;
+                        //Contracts.Assert(0 <= optIndex && optIndex < dp.Options.Length, $"Options index out of range: {optIndex}");
+                        var option = dp.Options[optIndex].ToString().ToLower();
+
+                        // Handle string values in sweep params
+                        if (option == "auto" || option == "" || option == "< auto >")
+                        {
+                            //Check if nullable type, in which case 'null' is the auto value.
+                            if (Nullable.GetUnderlyingType(fi.FieldType) != null)
+                                fi.SetValue(obj, null);
+                            else if (fi.FieldType.IsEnum)
+                            {
+                                // Check if there is an enum option named Auto
+                                var enumDict = fi.FieldType.GetEnumValues().Cast()
+                                    .ToDictionary(v => Enum.GetName(fi.FieldType, v), v => v);
+                                if (enumDict.ContainsKey("Auto"))
+                                    fi.SetValue(obj, enumDict["Auto"]);
+                            }
+                        }
+                        else
+                            SetValue(fi, (IComparable)dp.Options[optIndex], obj, propType);
+                    }
+                    else
+                        SetValue(fi, param.RawValue, obj, propType);
+                }
+                catch (Exception ex)
+                {
+                    // Keep the root cause attached so failures can be diagnosed.
+                    throw new InvalidOperationException($"Cannot set parameter {param.Name} for {obj.GetType()}", ex);
+                }
+            }
+        }
+
+        /// <summary>Maps a public binary-classification trainer enum value to its <see cref="TrainerName"/>.</summary>
+        public static TrainerName GetTrainerName(BinaryClassificationTrainer binaryTrainer)
+        {
+            switch (binaryTrainer)
+            {
+                case BinaryClassificationTrainer.AveragedPerceptron:
+                    return TrainerName.AveragedPerceptronBinary;
+                case BinaryClassificationTrainer.FastForest:
+                    return TrainerName.FastForestBinary;
+                case BinaryClassificationTrainer.FastTree:
+                    return TrainerName.FastTreeBinary;
+                case BinaryClassificationTrainer.LightGbm:
+                    return TrainerName.LightGbmBinary;
+                case BinaryClassificationTrainer.LinearSvm:
+                    return TrainerName.LinearSvmBinary;
+                case BinaryClassificationTrainer.LbfgsLogisticRegression:
+                    return TrainerName.LbfgsLogisticRegressionBinary;
+                case BinaryClassificationTrainer.SdcaLogisticRegression:
+                    return TrainerName.SdcaLogisticRegressionBinary;
+                case BinaryClassificationTrainer.SgdCalibrated:
+                    return TrainerName.SgdCalibratedBinary;
+                case BinaryClassificationTrainer.SymbolicSgdLogisticRegression:
+                    return TrainerName.SymbolicSgdLogisticRegressionBinary;
+            }
+
+            // never expected to reach here
+            throw new NotSupportedException($"{binaryTrainer} not supported");
+        }
+
+        /// <summary>Maps a public multiclass trainer enum value to its <see cref="TrainerName"/>.</summary>
+        public static TrainerName GetTrainerName(MulticlassClassificationTrainer multiTrainer)
+        {
+            switch (multiTrainer)
+            {
+                case MulticlassClassificationTrainer.AveragedPerceptronOva:
+                    return TrainerName.AveragedPerceptronOva;
+                case MulticlassClassificationTrainer.FastForestOva:
+                    return TrainerName.FastForestOva;
+                case MulticlassClassificationTrainer.FastTreeOva:
+                    return TrainerName.FastTreeOva;
+                case MulticlassClassificationTrainer.LightGbm:
+                    return TrainerName.LightGbmMulti;
+                case MulticlassClassificationTrainer.LinearSupportVectorMachinesOva:
+                    return TrainerName.LinearSvmOva;
+                case MulticlassClassificationTrainer.LbfgsMaximumEntropy:
+                    return TrainerName.LbfgsMaximumEntropyMulti;
+                case MulticlassClassificationTrainer.LbfgsLogisticRegressionOva:
+                    return TrainerName.LbfgsLogisticRegressionOva;
+                case MulticlassClassificationTrainer.SdcaMaximumEntropy:
+                    return TrainerName.SdcaMaximumEntropyMulti;
+                case MulticlassClassificationTrainer.SgdCalibratedOva:
+                    return TrainerName.SgdCalibratedOva;
+                case MulticlassClassificationTrainer.SymbolicSgdLogisticRegressionOva:
+                    return TrainerName.SymbolicSgdLogisticRegressionOva;
+            }
+
+            // never expected to reach here
+            throw new NotSupportedException($"{multiTrainer} not supported");
+        }
+
+        /// <summary>Maps a public regression trainer enum value to its <see cref="TrainerName"/>.</summary>
+        public static TrainerName GetTrainerName(RegressionTrainer regressionTrainer)
+        {
+            switch (regressionTrainer)
+            {
+                case RegressionTrainer.FastForest:
+                    return TrainerName.FastForestRegression;
+                case RegressionTrainer.FastTree:
+                    return TrainerName.FastTreeRegression;
+                case RegressionTrainer.FastTreeTweedie:
+                    return TrainerName.FastTreeTweedieRegression;
+                case RegressionTrainer.LightGbm:
+                    return TrainerName.LightGbmRegression;
+                case RegressionTrainer.OnlineGradientDescent:
+                    return TrainerName.OnlineGradientDescentRegression;
+                case RegressionTrainer.Ols:
+                    return TrainerName.OlsRegression;
+                case RegressionTrainer.LbfgsPoissonRegression:
+                    return TrainerName.LbfgsPoissonRegression;
+                case RegressionTrainer.StochasticDualCoordinateAscent:
+                    return TrainerName.SdcaRegression;
+            }
+
+            // never expected to reach here
+            throw new NotSupportedException($"{regressionTrainer} not supported");
+        }
+
+        // The three overloads below map a sequence element-wise; a null input yields null.
+        public static IEnumerable GetTrainerNames(IEnumerable binaryTrainers)
+        {
+            return binaryTrainers?.Select(t => GetTrainerName(t));
+        }
+
+        public static IEnumerable GetTrainerNames(IEnumerable multiTrainers)
+        {
+            return multiTrainers?.Select(t => GetTrainerName(t));
+        }
+
+        public static IEnumerable GetTrainerNames(IEnumerable regressionTrainers)
+        {
+            return regressionTrainers?.Select(t => GetTrainerName(t));
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/TransformInference/TransformInference.cs b/src/Microsoft.ML.AutoML/TransformInference/TransformInference.cs
new file mode 100644
index 0000000000..fd8e9d3c48
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TransformInference/TransformInference.cs
@@ -0,0 +1,419 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// A transform proposed by inference: the estimator to run plus the pipeline node describing it.
+    /// </summary>
+    internal class SuggestedTransform
+    {
+        public readonly IEstimator Estimator;
+        public readonly PipelineNode PipelineNode;
+
+        public SuggestedTransform(PipelineNode pipelineNode, IEstimator estimator)
+        {
+            PipelineNode = pipelineNode;
+            Estimator = estimator;
+        }
+
+        /// <summary>Returns a new wrapper that shares the same node and estimator instances.</summary>
+        public SuggestedTransform Clone()
+        {
+            return new SuggestedTransform(PipelineNode, Estimator);
+        }
+
+        /// <summary>
+        /// Formats the node as <c>Name{ col=out:in ... key=value ...}</c>. With several output
+        /// columns each is paired with the input at the same index; with one, all inputs are joined.
+        /// </summary>
+        public override string ToString()
+        {
+            var sb = new StringBuilder();
+            sb.Append(PipelineNode.Name);
+            sb.Append("{");
+            if (PipelineNode.OutColumns.Length > 1)
+            {
+                for (var i = 0; i < PipelineNode.OutColumns.Length; i++)
+                {
+                    sb.Append($" col={PipelineNode.OutColumns[i]}:{PipelineNode.InColumns[i]}");
+                }
+            }
+            else
+            {
+                sb.Append($" col={PipelineNode.OutColumns[0]}:{string.Join(",", PipelineNode.InColumns)}");
+            }
+            if (PipelineNode.Properties != null)
+            {
+                foreach (var property in PipelineNode.Properties)
+                {
+                    sb.Append($" {property.Key}={property.Value}");
+                }
+            }
+            sb.Append("}");
+            return sb.ToString();
+        }
+    }
+
+    /// <summary>
+    /// Auto-generate set of transforms for the data view, given the purposes of specified columns.
+    ///
+    /// The design uses a sequence of 'experts' that each look at all the columns.
+    /// Every expert may or may not suggest additional transforms.
+    /// If the expert needs some information about the column (for example, the column values),
+    /// this information is lazily calculated by the column object, not the expert itself, to allow the reuse
+    /// of the same information by another expert.
+    /// </summary>
+    internal static class TransformInference
+    {
+        // Immutable snapshot of one column as seen by the experts.
+        internal class IntermediateColumn
+        {
+            public readonly string ColumnName;
+            public readonly DataViewType Type;
+            public readonly ColumnPurpose Purpose;
+            public readonly ColumnDimensions Dimensions;
+
+            public IntermediateColumn(string name, DataViewType type, ColumnPurpose purpose, ColumnDimensions dimensions)
+            {
+                ColumnName = name;
+                Type = type;
+                Purpose = purpose;
+                Dimensions = dimensions;
+            }
+        }
+
+        internal sealed class ColumnRoutingStructure : IEquatable
+        {
+            public struct AnnotatedName
+            {
+                public string Name { get; set; }
+                public bool IsNumeric { get; set; }
+
+                public bool Equals(AnnotatedName an)
+                {
+                    return an.Name == Name &&
+                        an.IsNumeric == IsNumeric;
+                }
+
+                public override string ToString() => $"{Name}({IsNumeric})";
+            }
+
+            public AnnotatedName[] ColumnsConsumed { get; }
+            public AnnotatedName[] ColumnsProduced { get; }
+
+            public ColumnRoutingStructure(AnnotatedName[] columnsConsumed, AnnotatedName[] columnsProduced)
+            {
+                ColumnsConsumed = columnsConsumed;
+                ColumnsProduced = columnsProduced;
+            }
+
+            // True when every column of obj is also present here (a subset test, so it is not symmetric).
+            public bool Equals(ColumnRoutingStructure obj)
+            {
+                return obj != null &&
+                    obj.ColumnsConsumed.All(cc => ColumnsConsumed.Any(cc.Equals)) &&
+                    obj.ColumnsProduced.All(cp => ColumnsProduced.Any(cp.Equals));
+            }
+        }
+
+        internal interface ITransformInferenceExpert
+        {
+            IEnumerable Apply(IntermediateColumn[] columns, TaskKind task);
+        }
+
+        public abstract class TransformInferenceExpertBase : ITransformInferenceExpert
+        {
+            public abstract IEnumerable Apply(IntermediateColumn[] columns, TaskKind task);
+
+            protected readonly MLContext Context;
+
+            public TransformInferenceExpertBase(MLContext context)
+            {
+                Context = context;
+            }
+        }
+
+        private static IEnumerable GetExperts(MLContext context)
+        {
+            // The expert work independently of each other, the sequence is irrelevant
+            // (it only determines the sequence of resulting transforms).
+
+            // For multiclass tasks, convert label column to key
+            yield return new Experts.Label(context);
+
+            // For boolean columns use convert transform
+            yield return new Experts.Boolean(context);
+
+            // For categorical columns, use Cat transform.
+            yield return new Experts.Categorical(context);
+
+            // For text columns, use TextTransform.
+            yield return new Experts.Text(context);
+
+            // If numeric column has missing values, use Missing transform.
+            yield return new Experts.NumericMissing(context);
+        }
+
+        internal static class Experts
+        {
+            // Multiclass only: maps the last column with Label purpose to key type when it is not one already.
+            internal sealed class Label : TransformInferenceExpertBase
+            {
+                public Label(MLContext context) : base(context)
+                {
+                }
+
+                public override IEnumerable Apply(IntermediateColumn[] columns, TaskKind task)
+                {
+                    if (task != TaskKind.MulticlassClassification)
+                    {
+                        yield break;
+                    }
+
+                    var lastLabelColId = Array.FindLastIndex(columns, x => x.Purpose == ColumnPurpose.Label);
+                    if (lastLabelColId < 0)
+                        yield break;
+
+                    var col = columns[lastLabelColId];
+
+                    if (!col.Type.IsKey())
+                    {
+                        yield return ValueToKeyMappingExtension.CreateSuggestedTransform(Context, col.ColumnName, col.ColumnName);
+                    }
+                }
+            }
+
+            // Categorical features: one-hot for known cardinality below 100, one-hot-hash otherwise
+            // (including unknown cardinality).
+            internal sealed class Categorical : TransformInferenceExpertBase
+            {
+                public Categorical(MLContext context) : base(context)
+                {
+                }
+
+                public override IEnumerable Apply(IntermediateColumn[] columns, TaskKind task)
+                {
+                    bool foundCat = false;
+                    bool foundCatHash = false;
+                    var catColumnsNew = new List();
+                    var catHashColumnsNew = new List();
+
+                    foreach (var column in columns)
+                    {
+                        if (column.Purpose != ColumnPurpose.CategoricalFeature)
+                        {
+                            continue;
+                        }
+
+                        if (column.Dimensions.Cardinality != null && column.Dimensions.Cardinality < 100)
+                        {
+                            foundCat = true;
+                            catColumnsNew.Add(column.ColumnName);
+                        }
+                        else
+                        {
+                            foundCatHash = true;
+                            catHashColumnsNew.Add(column.ColumnName);
+                        }
+                    }
+
+                    if (foundCat)
+                    {
+                        var catColumnsArr = catColumnsNew.ToArray();
+                        yield return OneHotEncodingExtension.CreateSuggestedTransform(Context, catColumnsArr, catColumnsArr);
+                    }
+
+                    if (foundCatHash)
+                    {
+                        var catHashColumnsNewArr = catHashColumnsNew.ToArray();
+                        yield return OneHotHashEncodingExtension.CreateSuggestedTransform(Context, catHashColumnsNewArr, catHashColumnsNewArr);
+                    }
+                }
+            }
+
+            // Boolean-typed columns with NumericFeature purpose get a type conversion, in place.
+            internal sealed class Boolean : TransformInferenceExpertBase
+            {
+                public Boolean(MLContext context) : base(context)
+                {
+                }
+
+                public override IEnumerable Apply(IntermediateColumn[] columns, TaskKind task)
+                {
+                    var newColumns = new List();
+
+                    foreach (var column in columns)
+                    {
+                        if (!column.Type.GetItemType().IsBool() || column.Purpose != ColumnPurpose.NumericFeature)
+                        {
+                            continue;
+                        }
+
+                        newColumns.Add(column.ColumnName);
+                    }
+
+                    if (newColumns.Count() > 0)
+                    {
+                        var newColumnsArr = newColumns.ToArray();
+                        yield return TypeConvertingExtension.CreateSuggestedTransform(Context, newColumnsArr, newColumnsArr);
+                    }
+                }
+            }
+
+            // Text-typed columns with TextFeature purpose are featurized into "<name>_tf".
+            internal sealed class Text : TransformInferenceExpertBase
+            {
+                public Text(MLContext context) : base(context)
+                {
+                }
+
+                public override IEnumerable Apply(IntermediateColumn[] columns, TaskKind task)
+                {
+                    foreach (var column in columns)
+                    {
+                        if (!column.Type.GetItemType().IsText() || column.Purpose != ColumnPurpose.TextFeature)
+                            continue;
+
+                        var columnDestSuffix = "_tf";
+                        var columnNameSafe = column.ColumnName;
+
+                        string columnDestRenamed = $"{columnNameSafe}{columnDestSuffix}";
+
+                        yield return TextFeaturizingExtension.CreateSuggestedTransform(Context, columnNameSafe, columnDestRenamed);
+                    }
+                }
+            }
+
+            // Single-precision numeric features flagged as having missing values: emit an indicator
+            // column, convert the indicator's type, then replace the missing values in place.
+            internal sealed class NumericMissing : TransformInferenceExpertBase
+            {
+                public NumericMissing(MLContext context) : base(context)
+                {
+                }
+
+                public override IEnumerable Apply(IntermediateColumn[] columns, TaskKind task)
+                {
+                    var columnsWithMissing = new List();
+                    foreach (var column in columns)
+                    {
+                        if (column.Type.GetItemType() == NumberDataViewType.Single
+                            && column.Purpose == ColumnPurpose.NumericFeature
+                            && column.Dimensions.HasMissing == true)
+                        {
+                            columnsWithMissing.Add(column.ColumnName);
+                        }
+                    }
+                    if (columnsWithMissing.Any())
+                    {
+                        var columnsArr = columnsWithMissing.ToArray();
+                        var indicatorColNames = GetNewColumnNames(columnsArr.Select(c => $"{c}_MissingIndicator"), columns).ToArray();
+                        yield return MissingValueIndicatingExtension.CreateSuggestedTransform(Context, columnsArr, indicatorColNames);
+                        yield return TypeConvertingExtension.CreateSuggestedTransform(Context, indicatorColNames, indicatorColNames);
+                        yield return MissingValueReplacingExtension.CreateSuggestedTransform(Context, columnsArr, columnsArr);
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Automatically infer transforms for the data view
+        /// </summary>
+        /// <remarks>
+        /// Columns with Ignore purpose are dropped, every expert is applied in order, and a final
+        /// transform producing the features column is appended when one is needed.
+        /// </remarks>
+        public static SuggestedTransform[] InferTransforms(MLContext context, TaskKind task, DatasetColumnInfo[] columns)
+        {
+            var intermediateCols = columns.Where(c => c.Purpose != ColumnPurpose.Ignore)
+                .Select(c => new IntermediateColumn(c.Name, c.Type, c.Purpose, c.Dimensions))
+                .ToArray();
+
+            var suggestedTransforms = new List();
+            foreach (var expert in GetExperts(context))
+            {
+                SuggestedTransform[] suggestions = expert.Apply(intermediateCols, task).ToArray();
+                suggestedTransforms.AddRange(suggestions);
+            }
+
+            var finalFeaturesConcatTransform = BuildFinalFeaturesConcatTransform(context, suggestedTransforms, intermediateCols);
+            if (finalFeaturesConcatTransform != null)
+            {
+                suggestedTransforms.Add(finalFeaturesConcatTransform);
+            }
+
+            return suggestedTransforms.ToArray();
+        }
+
+        /// <summary>
+        /// Build final features concat transform, using output of all suggested experts.
+        /// Take the output columns from all suggested experts (except for 'Label'), and concatenate them
+        /// into one final 'Features' column that a trainer will accept.
+        /// </summary>
+        /// <returns>
+        /// Null when nothing needs combining, a column copy when a single vector/categorical/text
+        /// column is the only input, otherwise a concatenation of the distinct column names.
+        /// </returns>
+        private static SuggestedTransform BuildFinalFeaturesConcatTransform(MLContext context, IEnumerable suggestedTransforms,
+            IEnumerable intermediateCols)
+        {
+            // get the output column names from all suggested transforms
+            var concatColNames = new List();
+            foreach (var suggestedTransform in suggestedTransforms)
+            {
+                concatColNames.AddRange(suggestedTransform.PipelineNode.OutColumns);
+            }
+
+            // include all numeric columns of type R4
+            foreach(var intermediateCol in intermediateCols)
+            {
+                if (intermediateCol.Purpose == ColumnPurpose.NumericFeature &&
+                    intermediateCol.Type.GetItemType() == NumberDataViewType.Single)
+                {
+                    concatColNames.Add(intermediateCol.ColumnName);
+                }
+            }
+
+            // remove column with 'Label' purpose
+            var labelColumnName = intermediateCols.FirstOrDefault(c => c.Purpose == ColumnPurpose.Label)?.ColumnName;
+            concatColNames.Remove(labelColumnName);
+
+            // From here on only feature columns matter for the single-column special cases.
+            intermediateCols = intermediateCols.Where(c => c.Purpose == ColumnPurpose.NumericFeature ||
+                c.Purpose == ColumnPurpose.CategoricalFeature || c.Purpose == ColumnPurpose.TextFeature);
+
+            if (!concatColNames.Any() || (concatColNames.Count == 1 &&
+                concatColNames[0] == DefaultColumnNames.Features &&
+                intermediateCols.First().Type.IsVector()))
+            {
+                return null;
+            }
+
+            if (concatColNames.Count() == 1 &&
+                (intermediateCols.First().Type.IsVector() ||
+                intermediateCols.First().Purpose == ColumnPurpose.CategoricalFeature ||
+                intermediateCols.First().Purpose == ColumnPurpose.TextFeature))
+            {
+                return ColumnCopyingExtension.CreateSuggestedTransform(context, concatColNames.First(), DefaultColumnNames.Features);
+            }
+
+            return ColumnConcatenatingExtension.CreateSuggestedTransform(context, concatColNames.Distinct().ToArray(), DefaultColumnNames.Features);
+        }
+
+        // Returns one name per desired name, in order: the desired name itself when no existing
+        // column has it, otherwise the desired name with the first free numeric suffix (0, 1, ...).
+        // Only the pre-existing columns are checked, not names handed out earlier in this call.
+        private static IEnumerable GetNewColumnNames(IEnumerable desiredColNames, IEnumerable columns)
+        {
+            var newColNames = new List();
+
+            var existingColNames = new HashSet(columns.Select(c => c.ColumnName));
+            foreach (var desiredColName in desiredColNames)
+            {
+                if (!existingColNames.Contains(desiredColName))
+                {
+                    newColNames.Add(desiredColName);
+                    continue;
+                }
+
+                for(var i = 0; ; i++)
+                {
+                    var newColName = $"{desiredColName}{i}";
+                    if (!existingColNames.Contains(newColName))
+                    {
+                        newColNames.Add(newColName);
+                        break;
+                    }
+                }
+            }
+
+            return newColNames;
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/TransformInference/TransformInferenceApi.cs b/src/Microsoft.ML.AutoML/TransformInference/TransformInferenceApi.cs
new file mode 100644
index 0000000000..4843f6ef7e
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TransformInference/TransformInferenceApi.cs
@@ -0,0 +1,22 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    internal static class TransformInferenceApi
+    {
+        public static IEnumerable InferTransforms(MLContext context, TaskKind task, DatasetColumnInfo[] columns)
+        {
+            return TransformInference.InferTransforms(context, task, columns);
+        }
+
+        public static IEnumerable InferTransformsPostTrainer(MLContext context, TaskKind task, DatasetColumnInfo[] columns)
+        {
+            return TransformPostTrainerInference.InferTransforms(context, task, columns);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/TransformInference/TransformPostTrainerInference.cs b/src/Microsoft.ML.AutoML/TransformInference/TransformPostTrainerInference.cs
new file mode 100644
index 0000000000..ca98152e6d
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/TransformInference/TransformPostTrainerInference.cs
@@ -0,0 +1,43 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Infers the transforms to append after the trainer in a pipeline.
+    /// </summary>
+    internal class TransformPostTrainerInference
+    {
+        /// <summary>
+        /// Returns a new list holding every post-trainer transform suggested for the task.
+        /// </summary>
+        public static IEnumerable InferTransforms(MLContext context, TaskKind task, DatasetColumnInfo[] columns)
+        {
+            // Copy into a fresh list so the caller never sees the helper's own collection.
+            return InferLabelTransforms(context, task, columns).ToList();
+        }
+
+        // Multiclass only: when the label column is not key-typed, suggest mapping the
+        // predicted label back from key to value. Throws if no column has Label purpose.
+        private static IEnumerable InferLabelTransforms(MLContext context, TaskKind task,
+            DatasetColumnInfo[] columns)
+        {
+            var transforms = new List();
+
+            if (task == TaskKind.MulticlassClassification)
+            {
+                // If label column type wasn't originally key type,
+                // convert predicted label column back from key to value.
+                // (Non-key label column was converted to key, b/c multiclass trainers only
+                // accept label columns that are key type)
+                var label = columns.First(c => c.Purpose == ColumnPurpose.Label);
+                var labelIsKey = label.Type.IsKey();
+                if (!labelIsKey)
+                {
+                    var predicted = DefaultColumnNames.PredictedLabel;
+                    transforms.Add(KeyToValueMappingExtension.CreateSuggestedTransform(context, predicted, predicted));
+                }
+            }
+
+            return transforms;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/Utils/BestResultUtil.cs b/src/Microsoft.ML.AutoML/Utils/BestResultUtil.cs
new file mode 100644
index 0000000000..05cba5e8a7
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Utils/BestResultUtil.cs
@@ -0,0 +1,114 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Helpers for selecting the best run, or the top N runs, from a set of experiment results.
+    /// </summary>
+    internal class BestResultUtil
+    {
+        /// <summary>
+        /// Returns the binary classification run with the best validation score for
+        /// <paramref name="metric"/>, or null when no run has validation metrics.
+        /// </summary>
+        public static RunDetail<BinaryClassificationMetrics> GetBestRun(IEnumerable<RunDetail<BinaryClassificationMetrics>> results,
+            BinaryClassificationMetric metric)
+        {
+            var metricsAgent = new BinaryMetricsAgent(null, metric);
+            var metricInfo = new OptimizingMetricInfo(metric);
+            return GetBestRun(results, metricsAgent, metricInfo.IsMaximizing);
+        }
+
+        /// <summary>
+        /// Returns the regression run with the best validation score for
+        /// <paramref name="metric"/>, or null when no run has validation metrics.
+        /// </summary>
+        public static RunDetail<RegressionMetrics> GetBestRun(IEnumerable<RunDetail<RegressionMetrics>> results,
+            RegressionMetric metric)
+        {
+            var metricsAgent = new RegressionMetricsAgent(null, metric);
+            var metricInfo = new OptimizingMetricInfo(metric);
+            return GetBestRun(results, metricsAgent, metricInfo.IsMaximizing);
+        }
+
+        /// <summary>
+        /// Returns the multiclass classification run with the best validation score for
+        /// <paramref name="metric"/>, or null when no run has validation metrics.
+        /// </summary>
+        public static RunDetail<MulticlassClassificationMetrics> GetBestRun(IEnumerable<RunDetail<MulticlassClassificationMetrics>> results,
+            MulticlassClassificationMetric metric)
+        {
+            var metricsAgent = new MultiMetricsAgent(null, metric);
+            var metricInfo = new OptimizingMetricInfo(metric);
+            return GetBestRun(results, metricsAgent, metricInfo.IsMaximizing);
+        }
+
+        /// <summary>
+        /// Returns the run whose validation metrics score best, ignoring runs without
+        /// validation metrics. Returns null when no run qualifies.
+        /// </summary>
+        public static RunDetail<TMetrics> GetBestRun<TMetrics>(IEnumerable<RunDetail<TMetrics>> results,
+            IMetricsAgent<TMetrics> metricsAgent, bool isMetricMaximizing)
+        {
+            // Materialize the filtered runs once so the filter (and the caller's
+            // sequence) is not re-evaluated for the emptiness check and the final lookup.
+            var candidates = results.Where(r => r.ValidationMetrics != null).ToList();
+            if (candidates.Count == 0) { return null; }
+            var scores = candidates.Select(r => metricsAgent.GetScore(r.ValidationMetrics));
+            var indexOfBestScore = GetIndexOfBestScore(scores, isMetricMaximizing);
+            return candidates[indexOfBestScore];
+        }
+
+        /// <summary>
+        /// Returns the cross-validation run whose average fold score is best. Runs with
+        /// no fold results, or with no fold carrying validation metrics, are ignored.
+        /// Returns null when no run qualifies.
+        /// </summary>
+        public static CrossValidationRunDetail<TMetrics> GetBestRun<TMetrics>(IEnumerable<CrossValidationRunDetail<TMetrics>> results,
+            IMetricsAgent<TMetrics> metricsAgent, bool isMetricMaximizing)
+        {
+            var candidates = results
+                .Where(r => r.Results != null && r.Results.Any(x => x.ValidationMetrics != null))
+                .ToList();
+            if (candidates.Count == 0) { return null; }
+            var scores = candidates.Select(r => r.Results.Average(x => metricsAgent.GetScore(x.ValidationMetrics)));
+            var indexOfBestScore = GetIndexOfBestScore(scores, isMetricMaximizing);
+            return candidates[indexOfBestScore];
+        }
+
+        /// <summary>
+        /// Returns up to <paramref name="n"/> runs ordered from best to worst score, each
+        /// paired with its index among the runs that have validation metrics.
+        /// Returns null when no run has validation metrics.
+        /// </summary>
+        public static IEnumerable<(RunDetail<T>, int)> GetTopNRunResults<T>(IEnumerable<RunDetail<T>> results,
+            IMetricsAgent<T> metricsAgent, int n, bool isMetricMaximizing)
+        {
+            var candidates = results.Where(r => r.ValidationMetrics != null).ToList();
+            if (candidates.Count == 0) { return null; }
+
+            var indexedValues = candidates.Select((k, v) => (k, v));
+
+            IEnumerable<(RunDetail<T>, int)> orderedResults;
+            if (isMetricMaximizing)
+            {
+                orderedResults = indexedValues.OrderByDescending(t => metricsAgent.GetScore(t.Item1.ValidationMetrics));
+            }
+            else
+            {
+                orderedResults = indexedValues.OrderBy(t => metricsAgent.GetScore(t.Item1.ValidationMetrics));
+            }
+
+            return orderedResults.Take(n);
+        }
+
+        /// <summary>
+        /// Returns the index of the largest score when <paramref name="isMetricMaximizing"/>
+        /// is true, otherwise the index of the smallest score. Returns -1 when no score
+        /// compares strictly better than the starting infinity (for example, an empty sequence).
+        /// </summary>
+        public static int GetIndexOfBestScore(IEnumerable<double> scores, bool isMetricMaximizing)
+        {
+            return isMetricMaximizing ? GetIndexOfMaxScore(scores) : GetIndexOfMinScore(scores);
+        }
+
+        /// <summary>
+        /// Index of the first smallest score, or -1 if no score is below positive infinity.
+        /// </summary>
+        private static int GetIndexOfMinScore(IEnumerable<double> scores)
+        {
+            var minScore = double.PositiveInfinity;
+            var minIndex = -1;
+            var i = 0;
+            // Single pass: the sequence is usually a lazy projection, so indexing it with
+            // Count()/ElementAt() would recompute every score on each access.
+            foreach (var score in scores)
+            {
+                if (score < minScore)
+                {
+                    minScore = score;
+                    minIndex = i;
+                }
+                i++;
+            }
+            return minIndex;
+        }
+
+        /// <summary>
+        /// Index of the first largest score, or -1 if no score is above negative infinity.
+        /// </summary>
+        private static int GetIndexOfMaxScore(IEnumerable<double> scores)
+        {
+            var maxScore = double.NegativeInfinity;
+            var maxIndex = -1;
+            var i = 0;
+            // Single pass for the same reason as GetIndexOfMinScore.
+            foreach (var score in scores)
+            {
+                if (score > maxScore)
+                {
+                    maxScore = score;
+                    maxIndex = i;
+                }
+                i++;
+            }
+            return maxIndex;
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Utils/ColumnTypeExtensions.cs b/src/Microsoft.ML.AutoML/Utils/ColumnTypeExtensions.cs
new file mode 100644
index 0000000000..f05419d6d9
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Utils/ColumnTypeExtensions.cs
@@ -0,0 +1,36 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Type-test shortcuts for <see cref="DataViewType"/>.
+    /// </summary>
+    internal static class DataViewTypeExtensions
+    {
+        /// <summary>True when the type is a <see cref="NumberDataViewType"/>.</summary>
+        public static bool IsNumber(this DataViewType columnType) => columnType is NumberDataViewType;
+
+        /// <summary>True when the type is a <see cref="TextDataViewType"/>.</summary>
+        public static bool IsText(this DataViewType columnType) => columnType is TextDataViewType;
+
+        /// <summary>True when the type is a <see cref="BooleanDataViewType"/>.</summary>
+        public static bool IsBool(this DataViewType columnType) => columnType is BooleanDataViewType;
+
+        /// <summary>True when the type is a <see cref="VectorDataViewType"/>.</summary>
+        public static bool IsVector(this DataViewType columnType) => columnType is VectorDataViewType;
+
+        /// <summary>True when the type is a <see cref="KeyDataViewType"/>.</summary>
+        public static bool IsKey(this DataViewType columnType) => columnType is KeyDataViewType;
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/Utils/DatasetColumnInfo.cs b/src/Microsoft.ML.AutoML/Utils/DatasetColumnInfo.cs
new file mode 100644
index 0000000000..f43cb5828d
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Utils/DatasetColumnInfo.cs
@@ -0,0 +1,41 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Immutable description of one dataset column: its name, type, purpose and dimensions.
+    /// </summary>
+    internal class DatasetColumnInfo
+    {
+        public readonly string Name;
+        public readonly DataViewType Type;
+        public readonly ColumnPurpose Purpose;
+        public readonly ColumnDimensions Dimensions;
+
+        /// <summary>
+        /// Stores the supplied values as-is.
+        /// </summary>
+        public DatasetColumnInfo(string name, DataViewType type, ColumnPurpose purpose, ColumnDimensions dimensions)
+        {
+            Name = name;
+            Type = type;
+            Purpose = purpose;
+            Dimensions = dimensions;
+        }
+    }
+
+    internal static class DatasetColumnInfoUtil
+    {
+        /// <summary>
+        /// Builds one <see cref="DatasetColumnInfo"/> per schema column of <paramref name="data"/>,
+        /// pairing each column with its inferred purpose and computed dimensions by index.
+        /// </summary>
+        public static DatasetColumnInfo[] GetDatasetColumnInfo(MLContext context, IDataView data, ColumnInformation columnInfo)
+        {
+            var inferredPurposes = PurposeInference.InferPurposes(context, data, columnInfo);
+            var dimensions = DatasetDimensionsApi.CalcColumnDimensions(context, data, inferredPurposes);
+
+            var result = new DatasetColumnInfo[data.Schema.Count];
+            var index = 0;
+            while (index < result.Length)
+            {
+                var column = data.Schema[index];
+                result[index] = new DatasetColumnInfo(column.Name, column.Type,
+                    inferredPurposes[index].Purpose, dimensions[index]);
+                index++;
+            }
+            return result;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/Utils/Logger.cs b/src/Microsoft.ML.AutoML/Utils/Logger.cs
new file mode 100644
index 0000000000..749c5b7cc6
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Utils/Logger.cs
@@ -0,0 +1,30 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Runtime;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Thin wrapper that writes AutoML messages to a channel started on the ML context.
+    /// </summary>
+    internal class AutoMLLogger
+    {
+        public const string ChannelName = "AutoML";
+
+        private readonly IChannel _channel;
+
+        /// <summary>
+        /// Starts a channel named <see cref="ChannelName"/> on <paramref name="context"/>.
+        /// </summary>
+        public AutoMLLogger(MLContext context)
+        {
+            var channelProvider = (IChannelProvider)context;
+            _channel = channelProvider.Start(ChannelName);
+        }
+
+        /// <summary>Writes a trace message with no sensitivity marking.</summary>
+        public void Trace(string message) => _channel.Trace(MessageSensitivity.None, message);
+
+        /// <summary>Writes an error message with no sensitivity marking.</summary>
+        public void Error(string message) => _channel.Error(MessageSensitivity.None, message);
+    }
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.AutoML/Utils/SplitUtil.cs b/src/Microsoft.ML.AutoML/Utils/SplitUtil.cs
new file mode 100644
index 0000000000..aa98cfa0a8
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Utils/SplitUtil.cs
@@ -0,0 +1,70 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Microsoft.ML.AutoML
+{
+    internal static class SplitUtil
+    {
+        /// <summary>
+        /// Splits the data into cross-validation folds, skipping any fold whose train or
+        /// test set is empty, and drops columns that the split added to the schema.
+        /// Throws <see cref="InvalidOperationException"/> when every fold was skipped.
+        /// </summary>
+        public static (IDataView[] trainDatasets, IDataView[] validationDatasets) CrossValSplit(MLContext context,
+            IDataView trainData, uint numFolds, string samplingKeyColumn)
+        {
+            var inputColumnNames = trainData.Schema.Select(c => c.Name);
+            var folds = context.Data.CrossValidationSplit(trainData, (int)numFolds, samplingKeyColumnName: samplingKeyColumn);
+            var trainFolds = new List<IDataView>();
+            var validationFolds = new List<IDataView>();
+
+            foreach (var fold in folds)
+            {
+                // Keep the fold only when both halves contain rows.
+                var foldIsUsable = !DatasetDimensionsUtil.IsDataViewEmpty(fold.TrainSet) &&
+                    !DatasetDimensionsUtil.IsDataViewEmpty(fold.TestSet);
+                if (foldIsUsable)
+                {
+                    var foldTrain = DropAllColumnsExcept(context, fold.TrainSet, inputColumnNames);
+                    var foldValidation = DropAllColumnsExcept(context, fold.TestSet, inputColumnNames);
+
+                    trainFolds.Add(foldTrain);
+                    validationFolds.Add(foldValidation);
+                }
+            }
+
+            if (trainFolds.Count == 0)
+            {
+                throw new InvalidOperationException("All cross validation folds have empty train or test data. " +
+                    "Try increasing the number of rows provided in training data, or lowering specified number of " +
+                    "cross validation folds.");
+            }
+
+            return (trainFolds.ToArray(), validationFolds.ToArray());
+        }
+
+        /// <summary>
+        /// Split the data into a single train/test split.
+        /// </summary>
+        public static (IDataView trainData, IDataView validationData) TrainValidateSplit(MLContext context, IDataView trainData,
+            string samplingKeyColumn)
+        {
+            var inputColumnNames = trainData.Schema.Select(c => c.Name);
+            var split = context.Data.TrainTestSplit(trainData, samplingKeyColumnName: samplingKeyColumn);
+            var trainPart = DropAllColumnsExcept(context, split.TrainSet, inputColumnNames);
+            var validationPart = DropAllColumnsExcept(context, split.TestSet, inputColumnNames);
+            return (trainPart, validationPart);
+        }
+
+        /// <summary>
+        /// Returns <paramref name="data"/> with every column not listed in
+        /// <paramref name="columnsToKeep"/> dropped, or the same instance when there is nothing to drop.
+        /// </summary>
+        private static IDataView DropAllColumnsExcept(MLContext context, IDataView data, IEnumerable<string> columnsToKeep)
+        {
+            var extraColumns = data.Schema.Select(c => c.Name).Except(columnsToKeep);
+            return extraColumns.Any()
+                ? context.Transforms.DropColumns(extraColumns.ToArray()).Fit(data).Transform(data)
+                : data;
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Utils/SweepableParamAttributes.cs b/src/Microsoft.ML.AutoML/Utils/SweepableParamAttributes.cs
new file mode 100644
index 0000000000..ae7945d21e
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Utils/SweepableParamAttributes.cs
@@ -0,0 +1,214 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Globalization;
+using System.Linq;
+using System.Text;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Used to indicate suggested sweep ranges for parameter sweeping.
+    /// </summary>
+    internal abstract class SweepableParam
+    {
+        public string Name { get; set; }
+        private IComparable _rawValue;
+
+        /// <summary>
+        /// Current raw value. Assignments are ignored while <see cref="Frozen"/> is true.
+        /// </summary>
+        public virtual IComparable RawValue
+        {
+            get => _rawValue;
+            set
+            {
+                if (!Frozen)
+                    _rawValue = value;
+            }
+        }
+
+        // The raw value will store an index for discrete parameters,
+        // but sometimes we want the text or numeric value itself,
+        // not the hot index. The processed value does that for discrete
+        // params. For other params, it just returns the raw value itself.
+        public virtual IComparable ProcessedValue() => _rawValue;
+
+        // Allows for hyperparameter value freezing, so that sweeps
+        // will not alter the current value when true.
+        public bool Frozen { get; set; }
+
+        // Allows the sweepable param to be set directly using the
+        // available ValueText attribute on IParameterValues (from
+        // the ParameterSets used in the old hyperparameter sweepers).
+        public abstract void SetUsingValueText(string valueText);
+
+        public abstract SweepableParam Clone();
+    }
+
+    /// <summary>
+    /// Sweepable parameter that takes one of a fixed list of options.
+    /// <see cref="RawValue"/> holds the index of the selected option.
+    /// </summary>
+    internal sealed class SweepableDiscreteParam : SweepableParam
+    {
+        public object[] Options { get; }
+
+        public SweepableDiscreteParam(string name, object[] values, bool isBool = false) : this(values, isBool)
+        {
+            Name = name;
+        }
+
+        /// <summary>
+        /// When <paramref name="isBool"/> is true, <paramref name="values"/> is ignored and
+        /// the options are { false, true }.
+        /// </summary>
+        public SweepableDiscreteParam(object[] values, bool isBool = false)
+        {
+            Options = isBool ? new object[] { false, true } : values;
+        }
+
+        /// <summary>
+        /// Index of the selected option. Assignments are ignored while frozen or when the
+        /// converted index falls outside <see cref="Options"/>.
+        /// </summary>
+        public override IComparable RawValue
+        {
+            get => base.RawValue;
+            set
+            {
+                var val = Convert.ToInt32(value);
+                if (!Frozen && 0 <= val && val < Options.Length)
+                    base.RawValue = val;
+            }
+        }
+
+        /// <summary>
+        /// Selects the option whose <c>ToString()</c> equals <paramref name="valueText"/>.
+        /// If several options match, the last one wins; if none match, nothing changes.
+        /// </summary>
+        public override void SetUsingValueText(string valueText)
+        {
+            for (int i = 0; i < Options.Length; i++)
+                if (valueText == Options[i].ToString())
+                    RawValue = i;
+        }
+
+        /// <summary>
+        /// Renders an option as the C#-like literal used by <see cref="ToString"/>.
+        /// </summary>
+        private static string TranslateOption(object o)
+        {
+            switch (o)
+            {
+                case float _:
+                case double _:
+                    return $"{o}f";
+                case long _:
+                case int _:
+                case byte _:
+                case short _:
+                    return o.ToString();
+                case bool _:
+                    return o.ToString().ToLower();
+                case Enum _:
+                    var type = o.GetType();
+                    var defaultName = $"Enums.{type.Name}.{o.ToString()}";
+                    var name = type.FullName?.Replace("+", ".");
+                    if (name == null)
+                        return defaultName;
+                    var index1 = name.LastIndexOf(".", StringComparison.Ordinal);
+                    // No separator at all (enum declared outside any namespace or type):
+                    // there is no qualifier to keep, and Substring(0, -1) below would throw.
+                    if (index1 < 0)
+                        return defaultName;
+                    // Keep the last two segments ("Outer.Enum"); with a single separator the
+                    // search yields -1 + 1 == 0, i.e. the whole name.
+                    var index2 = name.Substring(0, index1).LastIndexOf(".", StringComparison.Ordinal) + 1;
+                    return $"{name.Substring(index2)}.{o.ToString()}";
+                default:
+                    return $"\"{o}\"";
+            }
+        }
+
+        // RawValue is assigned before Frozen so that a frozen source still copies its value.
+        public override SweepableParam Clone() =>
+            new SweepableDiscreteParam(Name, Options) { RawValue = RawValue, Frozen = Frozen };
+
+        public override string ToString()
+        {
+            var name = string.IsNullOrEmpty(Name) ? "" : $"\"{Name}\", ";
+            return $"[{GetType().Name}({name}new object[]{{{string.Join(", ", Options.Select(TranslateOption))}}})]";
+        }
+
+        /// <summary>Returns the selected option itself rather than its index.</summary>
+        public override IComparable ProcessedValue() => (IComparable)Options[(int)RawValue];
+    }
+
+    /// <summary>
+    /// Sweepable parameter over a float range, optionally stepped and/or log-scaled.
+    /// </summary>
+    internal sealed class SweepableFloatParam : SweepableParam
+    {
+        public float Min { get; }
+        public float Max { get; }
+        public float? StepSize { get; }
+        public int? NumSteps { get; }
+        public bool IsLogScale { get; }
+
+        public SweepableFloatParam(string name, float min, float max, float stepSize = -1, int numSteps = -1,
+            bool isLogScale = false) : this(min, max, stepSize, numSteps, isLogScale)
+        {
+            Name = name;
+        }
+
+        /// <summary>
+        /// A <paramref name="stepSize"/> or <paramref name="numSteps"/> of -1 means "not specified"
+        /// and leaves the corresponding property null.
+        /// </summary>
+        public SweepableFloatParam(float min, float max, float stepSize = -1, int numSteps = -1, bool isLogScale = false)
+        {
+            Min = min;
+            Max = max;
+            if (!stepSize.Equals(-1))
+                StepSize = stepSize;
+            if (numSteps != -1)
+                NumSteps = numSteps;
+            IsLogScale = isLogScale;
+        }
+
+        public override void SetUsingValueText(string valueText)
+        {
+            RawValue = float.Parse(valueText, CultureInfo.InvariantCulture);
+        }
+
+        // RawValue is assigned before Frozen so that a frozen source still copies its value.
+        public override SweepableParam Clone() =>
+            new SweepableFloatParam(Name, Min, Max, StepSize ?? -1, NumSteps ?? -1, IsLogScale) { RawValue = RawValue, Frozen = Frozen };
+
+        public override string ToString()
+        {
+            var optional = new StringBuilder();
+            if (StepSize != null)
+                optional.Append($", stepSize:{StepSize}");
+            if (NumSteps != null)
+                optional.Append($", numSteps:{NumSteps}");
+            if (IsLogScale)
+                optional.Append($", isLogScale:true");
+            var name = string.IsNullOrEmpty(Name) ? "" : $"\"{Name}\", ";
+            return $"[{GetType().Name}({name}{Min}f, {Max}f{optional})]";
+        }
+    }
+
+    /// <summary>
+    /// Sweepable parameter over a long range, optionally stepped and/or log-scaled.
+    /// </summary>
+    internal sealed class SweepableLongParam : SweepableParam
+    {
+        public long Min { get; }
+        public long Max { get; }
+        public float? StepSize { get; }
+        public int? NumSteps { get; }
+        public bool IsLogScale { get; }
+
+        public SweepableLongParam(string name, long min, long max, float stepSize = -1, int numSteps = -1,
+            bool isLogScale = false) : this(min, max, stepSize, numSteps, isLogScale)
+        {
+            Name = name;
+        }
+
+        /// <summary>
+        /// A <paramref name="stepSize"/> or <paramref name="numSteps"/> of -1 means "not specified"
+        /// and leaves the corresponding property null.
+        /// </summary>
+        public SweepableLongParam(long min, long max, float stepSize = -1, int numSteps = -1, bool isLogScale = false)
+        {
+            Min = min;
+            Max = max;
+            if (!stepSize.Equals(-1))
+                StepSize = stepSize;
+            if (numSteps != -1)
+                NumSteps = numSteps;
+            IsLogScale = isLogScale;
+        }
+
+        public override void SetUsingValueText(string valueText)
+        {
+            // Parse culture-independently, matching SweepableFloatParam.SetUsingValueText.
+            RawValue = long.Parse(valueText, CultureInfo.InvariantCulture);
+        }
+
+        // RawValue is assigned before Frozen so that a frozen source still copies its value.
+        public override SweepableParam Clone() =>
+            new SweepableLongParam(Name, Min, Max, StepSize ?? -1, NumSteps ?? -1, IsLogScale) { RawValue = RawValue, Frozen = Frozen };
+
+        public override string ToString()
+        {
+            var optional = new StringBuilder();
+            if (StepSize != null)
+                optional.Append($", stepSize:{StepSize}");
+            if (NumSteps != null)
+                optional.Append($", numSteps:{NumSteps}");
+            if (IsLogScale)
+                optional.Append($", isLogScale:true");
+            var name = string.IsNullOrEmpty(Name) ? "" : $"\"{Name}\", ";
+            return $"[{GetType().Name}({name}{Min}, {Max}{optional})]";
+        }
+    }
+}
diff --git a/src/Microsoft.ML.AutoML/Utils/UserInputValidationUtil.cs b/src/Microsoft.ML.AutoML/Utils/UserInputValidationUtil.cs
new file mode 100644
index 0000000000..dfbecb2634
--- /dev/null
+++ b/src/Microsoft.ML.AutoML/Utils/UserInputValidationUtil.cs
@@ -0,0 +1,272 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.AutoML
+{
+    /// <summary>
+    /// Argument validation for user-facing AutoML entry points. Every check throws an
+    /// argument exception describing the first problem it finds.
+    /// </summary>
+    internal static class UserInputValidationUtil
+    {
+        // column purpose names
+        private const string LabelColumnPurposeName = "label";
+        private const string WeightColumnPurposeName = "weight";
+        private const string NumericColumnPurposeName = "numeric";
+        private const string CategoricalColumnPurposeName = "categorical";
+        private const string TextColumnPurposeName = "text";
+        private const string IgnoredColumnPurposeName = "ignored";
+        private const string SamplingKeyColumnPurposeName = "sampling key";
+
+        /// <summary>
+        /// Validates the training data, the column information against that data, and the
+        /// optional validation data, in that order.
+        /// </summary>
+        public static void ValidateExperimentExecuteArgs(IDataView trainData, ColumnInformation columnInformation,
+            IDataView validationData, TaskKind task)
+        {
+            ValidateTrainData(trainData, columnInformation);
+            ValidateColumnInformation(trainData, columnInformation, task);
+            ValidateValidationData(trainData, validationData);
+        }
+
+        public static void ValidateInferColumnsArgs(string path, ColumnInformation columnInformation)
+        {
+            ValidateColumnInformation(columnInformation);
+            ValidatePath(path);
+        }
+
+        public static void ValidateInferColumnsArgs(string path, string labelColumn)
+        {
+            ValidateLabelColumn(labelColumn);
+            ValidatePath(path);
+        }
+
+        public static void ValidateInferColumnsArgs(string path)
+        {
+            ValidatePath(path);
+        }
+
+        /// <summary>
+        /// Throws when fewer than two cross-validation folds are requested.
+        /// </summary>
+        public static void ValidateNumberOfCVFoldsArg(uint numberOfCVFolds)
+        {
+            if (numberOfCVFolds <= 1)
+            {
+                throw new ArgumentException($"{nameof(numberOfCVFolds)} must be at least 2", nameof(numberOfCVFolds));
+            }
+        }
+
+        /// <summary>
+        /// Checks that the training data is non-null and non-empty, that a column named
+        /// Features has item type Single, and that every non-label column has a supported item type.
+        /// </summary>
+        private static void ValidateTrainData(IDataView trainData, ColumnInformation columnInformation)
+        {
+            if (trainData == null)
+            {
+                throw new ArgumentNullException(nameof(trainData), "Training data cannot be null");
+            }
+
+            if (DatasetDimensionsUtil.IsDataViewEmpty(trainData))
+            {
+                throw new ArgumentException("Training data has 0 rows", nameof(trainData));
+            }
+
+            foreach (var column in trainData.Schema)
+            {
+                if (column.Name == DefaultColumnNames.Features && column.Type.GetItemType() != NumberDataViewType.Single)
+                {
+                    throw new ArgumentException($"{DefaultColumnNames.Features} column must be of data type {NumberDataViewType.Single}", nameof(trainData));
+                }
+
+                if (column.Name != columnInformation.LabelColumnName &&
+                        column.Type.GetItemType() != BooleanDataViewType.Instance &&
+                        column.Type.GetItemType() != NumberDataViewType.Single &&
+                        column.Type.GetItemType() != TextDataViewType.Instance)
+                {
+                    throw new ArgumentException($"Only supported feature column types are " +
+                        $"{BooleanDataViewType.Instance}, {NumberDataViewType.Single}, and {TextDataViewType.Instance}. " +
+                        $"Please change the feature column {column.Name} of type {column.Type} to one of " +
+                        $"the supported types.", nameof(trainData));
+                }
+            }
+        }
+
+        /// <summary>
+        /// Validates the column information on its own, then checks that every named column
+        /// exists in the training data with an item type allowed for its purpose.
+        /// </summary>
+        private static void ValidateColumnInformation(IDataView trainData, ColumnInformation columnInformation, TaskKind task)
+        {
+            ValidateColumnInformation(columnInformation);
+            ValidateTrainDataColumn(trainData, columnInformation.LabelColumnName, LabelColumnPurposeName, GetAllowedLabelTypes(task));
+            ValidateTrainDataColumn(trainData, columnInformation.ExampleWeightColumnName, WeightColumnPurposeName);
+            ValidateTrainDataColumn(trainData, columnInformation.SamplingKeyColumnName, SamplingKeyColumnPurposeName);
+            ValidateTrainDataColumns(trainData, columnInformation.CategoricalColumnNames, CategoricalColumnPurposeName,
+                new DataViewType[] { NumberDataViewType.Single, TextDataViewType.Instance });
+            ValidateTrainDataColumns(trainData, columnInformation.NumericColumnNames, NumericColumnPurposeName,
+                new DataViewType[] { NumberDataViewType.Single, BooleanDataViewType.Instance });
+            ValidateTrainDataColumns(trainData, columnInformation.TextColumnNames, TextColumnPurposeName,
+                new DataViewType[] { TextDataViewType.Instance });
+            ValidateTrainDataColumns(trainData, columnInformation.IgnoredColumnNames, IgnoredColumnPurposeName);
+        }
+
+        /// <summary>
+        /// Checks that the label is non-null, that no column collection contains a null entry,
+        /// and that no column name appears under more than one purpose.
+        /// </summary>
+        private static void ValidateColumnInformation(ColumnInformation columnInformation)
+        {
+            ValidateLabelColumn(columnInformation.LabelColumnName);
+
+            ValidateColumnInfoEnumerationProperty(columnInformation.CategoricalColumnNames, CategoricalColumnPurposeName);
+            ValidateColumnInfoEnumerationProperty(columnInformation.NumericColumnNames, NumericColumnPurposeName);
+            ValidateColumnInfoEnumerationProperty(columnInformation.TextColumnNames, TextColumnPurposeName);
+            ValidateColumnInfoEnumerationProperty(columnInformation.IgnoredColumnNames, IgnoredColumnPurposeName);
+
+            // keep a list of all columns, to detect duplicates
+            var allColumns = new List<string>();
+            allColumns.Add(columnInformation.LabelColumnName);
+            if (columnInformation.ExampleWeightColumnName != null) { allColumns.Add(columnInformation.ExampleWeightColumnName); }
+            if (columnInformation.CategoricalColumnNames != null) { allColumns.AddRange(columnInformation.CategoricalColumnNames); }
+            if (columnInformation.NumericColumnNames != null) { allColumns.AddRange(columnInformation.NumericColumnNames); }
+            if (columnInformation.TextColumnNames != null) { allColumns.AddRange(columnInformation.TextColumnNames); }
+            if (columnInformation.IgnoredColumnNames != null) { allColumns.AddRange(columnInformation.IgnoredColumnNames); }
+
+            var duplicateColName = FindFirstDuplicate(allColumns);
+            if (duplicateColName != null)
+            {
+                throw new ArgumentException($"Duplicate column name {duplicateColName} is present in two or more distinct properties of provided column information", nameof(columnInformation));
+            }
+        }
+
+        private static void ValidateColumnInfoEnumerationProperty(IEnumerable<string> columns, string columnPurpose)
+        {
+            if (columns?.Contains(null) == true)
+            {
+                throw new ArgumentException($"Null column string was specified as {columnPurpose} in column information");
+            }
+        }
+
+        private static void ValidateLabelColumn(string labelColumn)
+        {
+            if (labelColumn == null)
+            {
+                throw new ArgumentException("Provided label column cannot be null");
+            }
+        }
+
+        /// <summary>
+        /// Checks that the path is non-null and points to an existing, non-empty file.
+        /// </summary>
+        private static void ValidatePath(string path)
+        {
+            if (path == null)
+            {
+                throw new ArgumentNullException(nameof(path), "Provided path cannot be null");
+            }
+
+            var fileInfo = new FileInfo(path);
+
+            if (!fileInfo.Exists)
+            {
+                throw new ArgumentException($"File '{path}' does not exist", nameof(path));
+            }
+
+            if (fileInfo.Length == 0)
+            {
+                throw new ArgumentException($"File at path '{path}' cannot be empty", nameof(path));
+            }
+        }
+
+        /// <summary>
+        /// When validation data is supplied, checks that it is non-empty and that its schema
+        /// has the same column count, names and types as the training data.
+        /// </summary>
+        private static void ValidateValidationData(IDataView trainData, IDataView validationData)
+        {
+            if (validationData == null)
+            {
+                return;
+            }
+
+            if (DatasetDimensionsUtil.IsDataViewEmpty(validationData))
+            {
+                throw new ArgumentException("Validation data has 0 rows", nameof(validationData));
+            }
+
+            const string schemaMismatchError = "Training data and validation data schemas do not match.";
+
+            if (trainData.Schema.Count != validationData.Schema.Count)
+            {
+                throw new ArgumentException($"{schemaMismatchError} Train data has '{trainData.Schema.Count}' columns, " +
+                    $"and validation data has '{validationData.Schema.Count}' columns.", nameof(validationData));
+            }
+
+            foreach (var trainCol in trainData.Schema)
+            {
+                var validCol = validationData.Schema.GetColumnOrNull(trainCol.Name);
+                if (validCol == null)
+                {
+                    throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' exists in train data, but not in validation data.", nameof(validationData));
+                }
+
+                if (trainCol.Type != validCol.Value.Type)
+                {
+                    throw new ArgumentException($"{schemaMismatchError} Column '{trainCol.Name}' is of type {trainCol.Type} in train data, and type " +
+                        $"{validCol.Value.Type} in validation data.", nameof(validationData));
+                }
+            }
+        }
+
+        private static void ValidateTrainDataColumns(IDataView trainData, IEnumerable<string> columnNames, string columnPurpose,
+            IEnumerable<DataViewType> allowedTypes = null)
+        {
+            if (columnNames == null)
+            {
+                return;
+            }
+
+            foreach (var columnName in columnNames)
+            {
+                ValidateTrainDataColumn(trainData, columnName, columnPurpose, allowedTypes);
+            }
+        }
+
+        /// <summary>
+        /// Checks that the named column exists in the training data and, when
+        /// <paramref name="allowedTypes"/> is given, that its item type is one of them.
+        /// A null column name is accepted without any check.
+        /// </summary>
+        private static void ValidateTrainDataColumn(IDataView trainData, string columnName, string columnPurpose, IEnumerable<DataViewType> allowedTypes = null)
+        {
+            if (columnName == null)
+            {
+                return;
+            }
+
+            var nullableColumn = trainData.Schema.GetColumnOrNull(columnName);
+            if (nullableColumn == null)
+            {
+                throw new ArgumentException($"Provided {columnPurpose} column '{columnName}' not found in training data.");
+            }
+
+            if (allowedTypes == null)
+            {
+                return;
+            }
+            var column = nullableColumn.Value;
+            var itemType = column.Type.GetItemType();
+            if (!allowedTypes.Contains(itemType))
+            {
+                if (allowedTypes.Count() == 1)
+                {
+                    throw new ArgumentException($"Provided {columnPurpose} column '{columnName}' was of type {itemType}, " +
+                        $"but only type {allowedTypes.First()} is allowed.");
+                }
+                else
+                {
+                    throw new ArgumentException($"Provided {columnPurpose} column '{columnName}' was of type {itemType}, " +
+                        $"but only types {string.Join(", ", allowedTypes)} are allowed.");
+                }
+            }
+        }
+
+        /// <summary>
+        /// Returns the first value that occurs more than once, or null when all values are distinct.
+        /// </summary>
+        private static string FindFirstDuplicate(IEnumerable<string> values)
+        {
+            var groups = values.GroupBy(v => v);
+            return groups.FirstOrDefault(g => g.Count() > 1)?.Key;
+        }
+
+        /// <summary>
+        /// Returns the label item types allowed for the task, or null when any type is accepted.
+        /// </summary>
+        private static IEnumerable<DataViewType> GetAllowedLabelTypes(TaskKind task)
+        {
+            switch (task)
+            {
+                case TaskKind.BinaryClassification:
+                    return new DataViewType[] { BooleanDataViewType.Instance };
+                // Multiclass label types are flexible, as we convert the label to a key type
+                // (if input label is not already a key) before invoking the trainer.
+                case TaskKind.MulticlassClassification:
+                    return null;
+                case TaskKind.Regression:
+                    return new DataViewType[] { NumberDataViewType.Single };
+                default:
+                    throw new NotSupportedException($"Unsupported task type: {task}");
+            }
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs
index 6186a37403..79601b4ab3 100644
--- a/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs
+++ b/src/Microsoft.ML.Core/Properties/AssemblyInfo.cs
@@ -12,6 +12,7 @@
 [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Sweeper.Tests" + PublicKey.TestValue)]
 [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.InferenceTesting" + PublicKey.TestValue)]
 [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.OnnxTransformerTest" + PublicKey.TestValue)]
+[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.AutoML.Tests" + PublicKey.TestValue)]
 
 [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.EntryPoints" + PublicKey.Value)]
 [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Maml" + PublicKey.Value)]
diff --git a/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs b/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs
index 8c5ecaaac4..7297568e10 100644
--- a/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs
+++ b/src/Microsoft.ML.CpuMath/Properties/AssemblyInfo.cs
@@ -11,6 +11,7 @@
 [assembly: InternalsVisibleTo(assemblyName: "LibSvmWrapper" + InternalPublicKey.Value)]
 [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Runtime.NeuralNetworks" + InternalPublicKey.Value)]
 [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.RServerScoring.NeuralNetworks" + InternalPublicKey.Value)]
+[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.AutoML" + PublicKey.Value)]
 [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)]
 [assembly: InternalsVisibleTo(assemblyName: "RunTests" + InternalPublicKey.Value)]
 [assembly: InternalsVisibleTo(assemblyName:
"SseTests" + InternalPublicKey.Value)] diff --git a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs index 097a36225a..db1be394a8 100644 --- a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs @@ -13,6 +13,7 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Predictor.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TimeSeries.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Benchmarks" + PublicKey.TestValue)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.AutoML.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Benchmarks" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.EntryPoints" + PublicKey.Value)] @@ -85,5 +86,6 @@ [assembly: InternalsVisibleTo(assemblyName: "DotNetBridge" + InternalPublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "NeuralNetworksTest" + InternalPublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "RunTestsAzurePublish" + InternalPublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.AutoML" + PublicKey.Value)] [assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.Dnn/DnnCatalog.cs b/src/Microsoft.ML.Dnn/DnnCatalog.cs index 3f5ee076ff..fea3735796 100644 --- a/src/Microsoft.ML.Dnn/DnnCatalog.cs +++ b/src/Microsoft.ML.Dnn/DnnCatalog.cs @@ -9,11 +9,12 @@ using Microsoft.ML.Data; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Dnn; -using static Microsoft.ML.Transforms.DnnEstimator; +using static Microsoft.ML.Transforms.ImageClassificationEstimator; +using Options = Microsoft.ML.Transforms.DnnRetrainEstimator.Options; namespace Microsoft.ML { - /// + /// public static class DnnCatalog { @@ -36,11 +37,10 @@ public static class DnnCatalog /// Learning rate to use during optimization (Optional). 
/// Add a batch dimension to the input e.g. input = [224, 224, 3] => [-1, 224, 224, 3]. /// This parameter is used to deal with models that have unknown shape but the internal operators in the model require data to have batch dimension as well. - /// /// /// The support for retraining is under preview. /// - public static DnnEstimator RetrainDnnModel( + public static DnnRetrainEstimator RetrainDnnModel( this ModelOperationsCatalog catalog, string[] outputColumnNames, string[] inputColumnNames, @@ -54,8 +54,7 @@ public static DnnEstimator RetrainDnnModel( string metricOperation = null, string learningRateOperation = null, float learningRate = 0.01f, - bool addBatchDimensionInput = false, - DnnFramework dnnFramework = DnnFramework.Tensorflow) + bool addBatchDimensionInput = false) { var options = new Options() { @@ -71,12 +70,11 @@ public static DnnEstimator RetrainDnnModel( LearningRateOperation = learningRateOperation, LearningRate = learningRate, BatchSize = batchSize, - AddBatchDimensionInputs = addBatchDimensionInput, - ReTrain = true + AddBatchDimensionInputs = addBatchDimensionInput }; var env = CatalogUtils.GetEnvironment(catalog); - return new DnnEstimator(env, options, DnnUtils.LoadDnnModel(env, modelPath, true)); + return new DnnRetrainEstimator(env, options, DnnUtils.LoadDnnModel(env, modelPath, true)); } /// @@ -85,33 +83,50 @@ public static DnnEstimator RetrainDnnModel( /// /// The name of the input features column. /// The name of the labels column. - /// Optional name of the path where a copy new graph should be saved. The graph will be saved as part of model. /// The name of the output score column. /// The name of the output predicted label columns. - /// The name of the prefix for checkpoint files. /// The architecture of the image recognition DNN model. - /// The backend DNN framework to use, currently only Tensorflow is supported. - /// Number of training epochs. + /// Number of training iterations. 
Each iteration/epoch refers to one pass over the dataset. /// The batch size for training. /// The learning rate for training. + /// Callback for reporting model statistics during training phase. + /// Indicates the frequency of epochs at which to report model statistics during training phase. + /// Indicates the choice of DNN training framework. Currently only tensorflow is supported. + /// Optional name of the path where a copy new graph should be saved. The graph will be saved as part of model. + /// The name of the prefix for the final mode and checkpoint files. + /// Validation set. + /// Indicates to evaluate the model on train set after every epoch. + /// Indicates to not re-compute cached trainset bottleneck values if already available in the bin folder. + /// Indicates to not re-compute validataionset cached bottleneck validationset values if already available in the bin folder. + /// Indicates the file path to store trainset bottleneck values for caching. + /// Indicates the file path to store validationset bottleneck values for caching. /// /// The support for image classification is under preview. 
/// - public static DnnEstimator ImageClassification( + public static ImageClassificationEstimator ImageClassification( this ModelOperationsCatalog catalog, string featuresColumnName, string labelColumnName, - string outputGraphPath = null, string scoreColumnName = "Score", string predictedLabelColumnName = "PredictedLabel", - string checkpointName = "_retrain_checkpoint", - Architecture arch = Architecture.ResnetV2101, - DnnFramework dnnFramework = DnnFramework.Tensorflow, - int epoch = 10, - int batchSize = 20, - float learningRate = 0.01f) + Architecture arch = Architecture.InceptionV3, + int epoch = 100, + int batchSize = 10, + float learningRate = 0.01f, + ImageClassificationMetricsCallback metricsCallback = null, + int statisticFrequency = 1, + DnnFramework framework = DnnFramework.Tensorflow, + string modelSavePath = null, + string finalModelPrefix = "custom_retrained_model_based_on_", + IDataView validationSet = null, + bool testOnTrainSet = true, + bool reuseTrainSetBottleneckCachedValues = false, + bool reuseValidationSetBottleneckCachedValues = false, + string trainSetBottleneckCachedValuesFilePath = "trainSetBottleneckFile.csv", + string validationSetBottleneckCachedValuesFilePath = "validationSetBottleneckFile.csv" + ) { - var options = new Options() + var options = new ImageClassificationEstimator.Options() { ModelLocation = arch == Architecture.ResnetV2101 ? @"resnet_v2_101_299.meta" : @"InceptionV3.meta", InputColumns = new[] { featuresColumnName }, @@ -121,13 +136,20 @@ public static DnnEstimator ImageClassification( Epoch = epoch, LearningRate = learningRate, BatchSize = batchSize, - AddBatchDimensionInputs = arch == Architecture.InceptionV3 ? 
false : true, - TransferLearning = true, ScoreColumnName = scoreColumnName, PredictedLabelColumnName = predictedLabelColumnName, - CheckpointName = checkpointName, + FinalModelPrefix = finalModelPrefix, Arch = arch, - MeasureTrainAccuracy = false + MetricsCallback = metricsCallback, + StatisticsFrequency = statisticFrequency, + Framework = framework, + ModelSavePath = modelSavePath, + ValidationSet = validationSet, + TestOnTrainSet = testOnTrainSet, + TrainSetBottleneckCachedValuesFilePath = trainSetBottleneckCachedValuesFilePath, + ValidationSetBottleneckCachedValuesFilePath = validationSetBottleneckCachedValuesFilePath, + ReuseTrainSetBottleneckCachedValues = reuseTrainSetBottleneckCachedValues, + ReuseValidationSetBottleneckCachedValues = reuseValidationSetBottleneckCachedValues }; if (!File.Exists(options.ModelLocation)) @@ -158,7 +180,7 @@ public static DnnEstimator ImageClassification( } var env = CatalogUtils.GetEnvironment(catalog); - return new DnnEstimator(env, options, DnnUtils.LoadDnnModel(env, options.ModelLocation, true)); + return new ImageClassificationEstimator(env, options, DnnUtils.LoadDnnModel(env, options.ModelLocation, true)); } } } diff --git a/src/Microsoft.ML.Dnn/DnnModel.cs b/src/Microsoft.ML.Dnn/DnnModel.cs index 6f8c54edb7..a5324e9e39 100644 --- a/src/Microsoft.ML.Dnn/DnnModel.cs +++ b/src/Microsoft.ML.Dnn/DnnModel.cs @@ -4,14 +4,14 @@ using Microsoft.ML.Runtime; using Tensorflow; -using static Microsoft.ML.Transforms.DnnEstimator; +using static Microsoft.ML.Transforms.DnnRetrainEstimator; namespace Microsoft.ML.Transforms { /// /// This class holds the information related to TensorFlow model and session. /// It provides some convenient methods to query model schema as well as - /// creation of object. + /// creation of object. 
/// public sealed class DnnModel { diff --git a/src/Microsoft.ML.Dnn/DnnTransform.cs b/src/Microsoft.ML.Dnn/DnnRetrainTransform.cs similarity index 64% rename from src/Microsoft.ML.Dnn/DnnTransform.cs rename to src/Microsoft.ML.Dnn/DnnRetrainTransform.cs index 56b3e13fd1..99cbd1afa5 100644 --- a/src/Microsoft.ML.Dnn/DnnTransform.cs +++ b/src/Microsoft.ML.Dnn/DnnRetrainTransform.cs @@ -8,7 +8,6 @@ using System.IO; using System.Linq; using System.Text; -using Google.Protobuf; using Microsoft.ML; using Microsoft.ML.CommandLine; using Microsoft.ML.Data; @@ -18,33 +17,29 @@ using Microsoft.ML.Transforms.Dnn; using NumSharp; using Tensorflow; -using Tensorflow.Summaries; using static Microsoft.ML.Transforms.Dnn.DnnUtils; -using static Microsoft.ML.Transforms.DnnEstimator; -using static Tensorflow.Python; -[assembly: LoadableClass(DnnTransformer.Summary, typeof(IDataTransform), typeof(DnnTransformer), - typeof(DnnEstimator.Options), typeof(SignatureDataTransform), DnnTransformer.UserName, DnnTransformer.ShortName)] +[assembly: LoadableClass(DnnRetrainTransformer.Summary, typeof(IDataTransform), typeof(DnnRetrainTransformer), + typeof(DnnRetrainEstimator.Options), typeof(SignatureDataTransform), DnnRetrainTransformer.UserName, DnnRetrainTransformer.ShortName)] -[assembly: LoadableClass(DnnTransformer.Summary, typeof(IDataTransform), typeof(DnnTransformer), null, typeof(SignatureLoadDataTransform), - DnnTransformer.UserName, DnnTransformer.LoaderSignature)] +[assembly: LoadableClass(DnnRetrainTransformer.Summary, typeof(IDataTransform), typeof(DnnRetrainTransformer), null, typeof(SignatureLoadDataTransform), + DnnRetrainTransformer.UserName, DnnRetrainTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(DnnTransformer), null, typeof(SignatureLoadModel), - DnnTransformer.UserName, DnnTransformer.LoaderSignature)] +[assembly: LoadableClass(typeof(DnnRetrainTransformer), null, typeof(SignatureLoadModel), + DnnRetrainTransformer.UserName, 
DnnRetrainTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(IRowMapper), typeof(DnnTransformer), null, typeof(SignatureLoadRowMapper), - DnnTransformer.UserName, DnnTransformer.LoaderSignature)] +[assembly: LoadableClass(typeof(IRowMapper), typeof(DnnRetrainTransformer), null, typeof(SignatureLoadRowMapper), + DnnRetrainTransformer.UserName, DnnRetrainTransformer.LoaderSignature)] namespace Microsoft.ML.Transforms { /// - /// for the . + /// for the . /// - public sealed class DnnTransformer : RowToRowTransformerBase + public sealed class DnnRetrainTransformer : RowToRowTransformerBase { private readonly IHostEnvironment _env; private readonly string _modelLocation; - private readonly bool _transferLearning; private readonly bool _isTemporarySavedModel; private readonly bool _addBatchDimensionInput; private Session _session; @@ -56,33 +51,15 @@ public sealed class DnnTransformer : RowToRowTransformerBase private readonly (Operation, int)[] _tfOutputOperations; private TF_Output[] _tfInputNodes; private readonly TF_Output[] _tfOutputNodes; - private Tensor _bottleneckTensor; - private Operation _trainStep; - private Tensor _softMaxTensor; - private Tensor _crossEntropy; - private Tensor _labelTensor; - private Tensor _evaluationStep; - private Tensor _prediction; - private readonly int _classCount; - private readonly string _checkpointPath; - private readonly string _bottleneckOperationName; private Graph Graph => _session.graph; private readonly Dictionary _idvToTfMapping; private readonly string[] _inputs; private readonly string[] _outputs; - private readonly string _labelColumnName; - private readonly string _checkpointName; - private readonly Architecture _arch; - private readonly string _scoreColumnName; - private readonly string _predictedLabelColumnName; - private readonly float _learningRate; - private readonly string _softmaxTensorName; - private readonly string _predictionTensorName; - - internal const string Summary = "Trains Dnn models."; - 
internal const string UserName = "DnnTransform"; - internal const string ShortName = "DnnTransform"; - internal const string LoaderSignature = "DnnTransform"; + + internal const string Summary = "Re-Trains Dnn models."; + internal const string UserName = "DnnRtTransform"; + internal const string ShortName = "DnnRtTransform"; + internal const string LoaderSignature = "DnnRtTransform"; internal static class DefaultModelFileNames { @@ -102,11 +79,11 @@ private static VersionInfo GetVersionInfo() verReadableCur: 0x00000001, verWeCanReadBack: 0x00000001, loaderSignature: LoaderSignature, - loaderAssemblyName: typeof(DnnTransformer).Assembly.FullName); + loaderAssemblyName: typeof(DnnRetrainTransformer).Assembly.FullName); } // Factory method for SignatureLoadModel. - private static DnnTransformer Create(IHostEnvironment env, ModelLoadContext ctx) + private static DnnRetrainTransformer Create(IHostEnvironment env, ModelLoadContext ctx) { Contracts.CheckValue(env, nameof(env)); env.CheckValue(ctx, nameof(ctx)); @@ -123,9 +100,7 @@ private static DnnTransformer Create(IHostEnvironment env, ModelLoadContext ctx) // int: id of output column name // stream: tensorFlow model. 
- GetModelInfo(env, ctx, out string[] inputs, out string[] outputs, out bool isFrozen, out bool addBatchDimensionInput, - out bool transferLearning, out string labelColumn, out string checkpointName, out Architecture arch, out string scoreColumnName, - out string predictedColumnName, out float learningRate, out int classCount, out string predictionTensorName, out string softMaxTensorName); + GetModelInfo(env, ctx, out string[] inputs, out string[] outputs, out bool isFrozen, out bool addBatchDimensionInput); if (isFrozen) { @@ -133,12 +108,11 @@ private static DnnTransformer Create(IHostEnvironment env, ModelLoadContext ctx) if (!ctx.TryLoadBinaryStream("TFModel", r => modelBytes = r.ReadByteArray())) throw env.ExceptDecode(); - return new DnnTransformer(env, DnnUtils.LoadTFSession(env, modelBytes), outputs, inputs, - null, false, addBatchDimensionInput, 1, transferLearning, labelColumn, checkpointName, arch, - scoreColumnName, predictedColumnName, learningRate, null, classCount, true, predictionTensorName, softMaxTensorName); + return new DnnRetrainTransformer(env, DnnUtils.LoadTFSession(env, modelBytes), outputs, inputs, + null, false, addBatchDimensionInput, 1); } - var tempDirPath = Path.GetFullPath(Path.Combine(Path.GetTempPath(), nameof(DnnTransformer) + "_" + Guid.NewGuid())); + var tempDirPath = Path.GetFullPath(Path.Combine(Path.GetTempPath(), nameof(DnnRetrainTransformer) + "_" + Guid.NewGuid())); DnnUtils.CreateFolderWithAclIfNotExists(env, tempDirPath); try { @@ -164,9 +138,8 @@ private static DnnTransformer Create(IHostEnvironment env, ModelLoadContext ctx) } }); - return new DnnTransformer(env, DnnUtils.GetSession(env, tempDirPath), outputs, inputs, tempDirPath, true, - addBatchDimensionInput, 1, transferLearning, labelColumn, checkpointName, arch, - scoreColumnName, predictedColumnName, learningRate, null, classCount, true, predictionTensorName, softMaxTensorName); + return new DnnRetrainTransformer(env, DnnUtils.GetSession(env, tempDirPath), 
outputs, inputs, tempDirPath, true, + addBatchDimensionInput, 1); } catch (Exception) { @@ -176,7 +149,7 @@ private static DnnTransformer Create(IHostEnvironment env, ModelLoadContext ctx) } // Factory method for SignatureDataTransform. - internal static IDataTransform Create(IHostEnvironment env, DnnEstimator.Options options, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, DnnRetrainEstimator.Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); env.CheckValue(options, nameof(options)); @@ -184,33 +157,30 @@ internal static IDataTransform Create(IHostEnvironment env, DnnEstimator.Options env.CheckValue(options.InputColumns, nameof(options.InputColumns)); env.CheckValue(options.OutputColumns, nameof(options.OutputColumns)); - return new DnnTransformer(env, options, input).MakeDataTransform(input); + return new DnnRetrainTransformer(env, options, input).MakeDataTransform(input); } - internal DnnTransformer(IHostEnvironment env, DnnEstimator.Options options, IDataView input) + internal DnnRetrainTransformer(IHostEnvironment env, DnnRetrainEstimator.Options options, IDataView input) : this(env, options, DnnUtils.LoadDnnModel(env, options.ModelLocation), input) { } - internal DnnTransformer(IHostEnvironment env, DnnEstimator.Options options, DnnModel tensorFlowModel, IDataView input, IDataView validationSet = null) + internal DnnRetrainTransformer(IHostEnvironment env, DnnRetrainEstimator.Options options, DnnModel tensorFlowModel, IDataView input, IDataView validationSet = null) : this(env, tensorFlowModel.Session, options.OutputColumns, options.InputColumns, - options.ModelLocation, false, options.AddBatchDimensionInputs, options.BatchSize, options.TransferLearning, - options.LabelColumn, options.CheckpointName, options.Arch, options.ScoreColumnName, - options.PredictedLabelColumnName, options.LearningRate, input.Schema) + options.ModelLocation, false, options.AddBatchDimensionInputs, options.BatchSize) { 
Contracts.CheckValue(env, nameof(env)); env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - if (options.ReTrain) - CheckTrainingParameters(options); + CheckTrainingParameters(options); - if (options.ReTrain && !DnnUtils.IsSavedModel(env, options.ModelLocation)) + if (!DnnUtils.IsSavedModel(env, options.ModelLocation)) throw env.ExceptNotSupp("TensorFlowTransform: Re-Training of TensorFlow model is only supported for un-frozen model."); TrainCore(options, input, validationSet); } - private void CheckTrainingParameters(DnnEstimator.Options options) + private void CheckTrainingParameters(DnnRetrainEstimator.Options options) { Host.CheckNonWhiteSpace(options.LabelColumn, nameof(options.LabelColumn)); Host.CheckNonWhiteSpace(options.OptimizationOperation, nameof(options.OptimizationOperation)); @@ -296,7 +266,7 @@ private void CheckTrainingParameters(DnnEstimator.Options options) return (inputColIndex, isInputVector, tfInputType, tfInputShape); } - private void TrainCore(DnnEstimator.Options options, IDataView input, IDataView validationSet) + private void TrainCore(DnnRetrainEstimator.Options options, IDataView input, IDataView validationSet) { var inputsForTraining = new string[_inputs.Length + 1]; var inputColIndices = new int[inputsForTraining.Length]; @@ -313,10 +283,7 @@ private void TrainCore(DnnEstimator.Options options, IDataView input, IDataView GetTrainingInputInfo(inputSchema, _inputs[i], inputsForTraining[i], options.BatchSize); var index = inputsForTraining.Length - 1; - if (options.TransferLearning) - inputsForTraining[index] = _labelTensor.name.Split(':').First(); - else - inputsForTraining[index] = options.TensorFlowLabel; + inputsForTraining[index] = options.TensorFlowLabel; (inputColIndices[index], isInputVector[index], tfInputTypes[index], tfInputShapes[index]) = GetTrainingInputInfo(inputSchema, options.LabelColumn, inputsForTraining[index], options.BatchSize); @@ -324,14 +291,9 @@ private void 
TrainCore(DnnEstimator.Options options, IDataView input, IDataView // Create graph inputs. Operation labelOp; int labelOpIdx; - if (options.ReTrain) - (labelOp, labelOpIdx) = GetOperationFromName(options.TensorFlowLabel, _session); - else - (labelOp, labelOpIdx) = GetOperationFromName(_labelTensor.name, _session); - + (labelOp, labelOpIdx) = GetOperationFromName(options.TensorFlowLabel, _session); TF_Output[] tfInputs; - - if (options.ReTrain && !string.IsNullOrEmpty(options.LearningRateOperation)) + if (!string.IsNullOrEmpty(options.LearningRateOperation)) tfInputs = new TF_Output[_tfInputNodes.Length + 2]; //Inputs + Label + Learning Rate. else tfInputs = new TF_Output[_tfInputNodes.Length + 1]; //Inputs + Label. @@ -339,32 +301,13 @@ private void TrainCore(DnnEstimator.Options options, IDataView input, IDataView Array.Copy(_tfInputNodes, tfInputs, _tfInputNodes.Length); tfInputs[_tfInputNodes.Length] = new TF_Output(labelOp, labelOpIdx); - - if (options.ReTrain) - { - var lr = GetOperationFromName(options.LearningRateOperation, _session); - tfInputs[_tfInputNodes.Length + 1] = new TF_Output(lr.Item1, lr.Item2); - } + var lr = GetOperationFromName(options.LearningRateOperation, _session); + tfInputs[_tfInputNodes.Length + 1] = new TF_Output(lr.Item1, lr.Item2); // Create graph operations. 
IntPtr[] ops = null; - if (options.ReTrain && options.OptimizationOperation != null) + if (options.OptimizationOperation != null) ops = new[] { c_api.TF_GraphOperationByName(Graph, options.OptimizationOperation) }; - else - ops = new[] { (IntPtr)_trainStep }; - - Saver trainSaver = null; - FileWriter trainWriter = null; - Tensor merged = null; - Runner testSetRunner = null; - Runner validationSetRunner = null; - if (options.TransferLearning) - { - merged = tf.summary.merge_all(); - trainWriter = tf.summary.FileWriter(Path.Combine(Directory.GetCurrentDirectory(), "train"), _session.graph); - trainSaver = tf.train.Saver(); - trainSaver.save(_session, _checkpointPath); - } // Instantiate the graph. Runner runner; @@ -379,190 +322,53 @@ private void TrainCore(DnnEstimator.Options options, IDataView input, IDataView using (var ch = Host.Start("Training TensorFlow model...")) using (var pch = Host.StartProgressChannel("TensorFlow training progress...")) { - if (options.ReTrain) - { - float loss = 0; - float metric = 0; - pch.SetHeader(new ProgressHeader(new[] { "Loss", "Metric" }, new[] { "Epoch" }), (e) => e.SetProgress(0, epoch, options.Epoch)); + float loss = 0; + float metric = 0; + pch.SetHeader(new ProgressHeader(new[] { "Loss", "Metric" }, new[] { "Epoch" }), (e) => e.SetProgress(0, epoch, options.Epoch)); - while (cursor.MoveNext()) - { - for (int i = 0; i < inputsForTraining.Length; i++) - { - isDataLeft = true; - srcTensorGetters[i].BufferTrainingData(); - } - - if (((cursor.Position + 1) % options.BatchSize) == 0) - { - isDataLeft = false; - runner = new Runner(_session); - - // Add Learning Rate. - if (!string.IsNullOrEmpty(options.LearningRateOperation)) - runner.AddInput(options.LearningRateOperation, new Tensor(options.LearningRate)); - - // Add operations. - if (!string.IsNullOrEmpty(options.OptimizationOperation)) - runner.AddOperation(options.OptimizationOperation); - - // Add outputs. 
- if (options.LossOperation != null) - runner.AddOutputs(options.LossOperation); - if (options.MetricOperation != null) - runner.AddOutputs(options.MetricOperation); - - var (l, m) = ExecuteGraphAndRetrieveMetrics(inputsForTraining, srcTensorGetters, runner); - loss += l; - metric += m; - } - } - if (isDataLeft) - { - isDataLeft = false; - ch.Warning("Not training on the last batch. The batch size is less than {0}.", options.BatchSize); - } - pch.Checkpoint(new double?[] { loss, metric }); - } - else + while (cursor.MoveNext()) { - pch.SetHeader(new ProgressHeader(null, new[] { "Epoch" }), (e) => e.SetProgress(0, epoch, options.Epoch)); - - while (cursor.MoveNext()) + for (int i = 0; i < inputsForTraining.Length; i++) { - for (int i = 0; i < inputsForTraining.Length; i++) - { - isDataLeft = true; - srcTensorGetters[i].BufferTrainingData(); - } - - if (((cursor.Position + 1) % options.BatchSize) == 0) - { - isDataLeft = false; - runner = new Runner(_session); - - // Add operations. - runner.AddOperation(_trainStep); - - // Feed inputs. - for (int i = 0; i < inputsForTraining.Length; i++) - runner.AddInput(inputsForTraining[i], srcTensorGetters[i].GetBufferedBatchTensor()); - - // Execute the graph. - var t = runner.Run(); - } + isDataLeft = true; + srcTensorGetters[i].BufferTrainingData(); } - if (isDataLeft) + if (((cursor.Position + 1) % options.BatchSize) == 0) { isDataLeft = false; - ch.Warning("Not training on the last batch. The batch size is less than {0}.", options.BatchSize); - } - } - } - } + runner = new Runner(_session); - // Measure accuracy of the model. - if (options.TransferLearning && options.MeasureTrainAccuracy) - { - // Test on the training set to get accuracy. 
- using (var cursor = input.GetRowCursor(cols)) - { - var srcTensorGetters = GetTensorValueGetters(cursor, inputColIndices, isInputVector, tfInputTypes, tfInputShapes); - - float accuracy = 0; - float crossEntropy = 0; - bool isDataLeft = false; - int batch = 0; - using (var ch = Host.Start("Test TensorFlow model...")) - using (var pch = Host.StartProgressChannel("TensorFlow testing progress...")) - { - pch.SetHeader(new ProgressHeader(new[] { "Accuracy", "Cross Entropy" }, new[] { "Epoch" }), (e) => e.SetProgress(0, epoch, options.Epoch)); + // Add Learning Rate. + if (!string.IsNullOrEmpty(options.LearningRateOperation)) + runner.AddInput(options.LearningRateOperation, new Tensor(options.LearningRate)); - while (cursor.MoveNext()) - { - for (int i = 0; i < inputColIndices.Length; i++) - { - isDataLeft = true; - srcTensorGetters[i].BufferTrainingData(); - } - - if (((cursor.Position + 1) % options.BatchSize) == 0) - { - isDataLeft = false; - testSetRunner = new Runner(_session); - testSetRunner.AddOutputs(_evaluationStep.name); - testSetRunner.AddOutputs(_crossEntropy.name); - testSetRunner.AddOutputs(_bottleneckTensor.name); - var (acc, ce) = ExecuteGraphAndRetrieveMetrics(inputsForTraining, srcTensorGetters, testSetRunner); - accuracy += acc; - crossEntropy += ce; - batch++; - } - } + // Add operations. + if (!string.IsNullOrEmpty(options.OptimizationOperation)) + runner.AddOperation(options.OptimizationOperation); - if (isDataLeft) - { - isDataLeft = false; - ch.Warning("Not training on the last batch. The batch size is less than {0}.", options.BatchSize); + // Add outputs. 
+ if (options.LossOperation != null) + runner.AddOutputs(options.LossOperation); + if (options.MetricOperation != null) + runner.AddOutputs(options.MetricOperation); + + var (l, m) = ExecuteGraphAndRetrieveMetrics(inputsForTraining, srcTensorGetters, runner); + loss += l; + metric += m; } - pch.Checkpoint(new double?[] { accuracy / batch, crossEntropy / batch }); - ch.Info(MessageSensitivity.None, $"Accuracy: {accuracy / batch}, Cross-Entropy: {crossEntropy / batch}"); } - } - - // Test on the validation set. - if (validationSet != null) - { - using (var cursor = validationSet.GetRowCursor(cols)) + if (isDataLeft) { - var srcTensorGetters = GetTensorValueGetters(cursor, inputColIndices, isInputVector, tfInputTypes, tfInputShapes); - - float accuracy = 0; - bool isDataLeft = false; - int batch = 0; - using (var ch = Host.Start("Test TensorFlow model with validation set...")) - using (var pch = Host.StartProgressChannel("TensorFlow validation progress...")) - { - pch.SetHeader(new ProgressHeader(new[] { "Accuracy" }, new[] { "Epoch" }), (e) => e.SetProgress(0, epoch, options.Epoch)); - - while (cursor.MoveNext()) - { - for (int i = 0; i < inputColIndices.Length; i++) - { - isDataLeft = true; - srcTensorGetters[i].BufferTrainingData(); - } - - if (((cursor.Position + 1) % options.BatchSize) == 0) - { - isDataLeft = false; - validationSetRunner = new Runner(_session); - validationSetRunner.AddOutputs(_evaluationStep.name); - var (acc, _) = ExecuteGraphAndRetrieveMetrics(inputsForTraining, srcTensorGetters, validationSetRunner); - accuracy += acc; - batch++; - } - } - if (isDataLeft) - { - isDataLeft = false; - ch.Warning("Not training on the last batch. The batch size is less than {0}.", options.BatchSize); - } - pch.Checkpoint(new double?[] { accuracy / batch }); - } + isDataLeft = false; + ch.Warning("Not training on the last batch. 
The batch size is less than {0}.", options.BatchSize); } + pch.Checkpoint(new double?[] { loss, metric }); } } } - if (options.ReTrain) - UpdateModelOnDisk(options.ModelLocation, options); - else - { - trainSaver.save(_session, _checkpointPath); - UpdateTransferLearningModelOnDisk(options, _classCount); - } + UpdateModelOnDisk(options.ModelLocation, options); } private (float loss, float metric) ExecuteGraphAndRetrieveMetrics( @@ -588,7 +394,7 @@ private void TrainCore(DnnEstimator.Options options, IDataView input, IDataView /// After retraining Session and Graphs are both up-to-date /// However model on disk is not which is used to serialzed to ML.Net stream /// - private void UpdateModelOnDisk(string modelDir, DnnEstimator.Options options) + private void UpdateModelOnDisk(string modelDir, DnnRetrainEstimator.Options options) { try { @@ -648,150 +454,6 @@ private void UpdateModelOnDisk(string modelDir, DnnEstimator.Options options) } } - private (Session, Tensor, Tensor, Tensor) BuildEvaluationSession(DnnEstimator.Options options, int classCount) - { - var evalGraph = DnnUtils.LoadMetaGraph(options.ModelLocation); - var evalSess = tf.Session(graph: evalGraph); - Tensor evaluationStep = null; - Tensor prediction = null; - Tensor bottleneckTensor = evalGraph.OperationByName(_bottleneckOperationName); - - tf_with(evalGraph.as_default(), graph => - { - var (_, _, groundTruthInput, finalTensor) = AddFinalRetrainOps(classCount, options.LabelColumn, - options.ScoreColumnName, options.LearningRate, bottleneckTensor, false); - - tf.train.Saver().restore(evalSess, Path.Combine(Directory.GetCurrentDirectory(), _checkpointPath)); - (evaluationStep, prediction) = AddEvaluationStep(finalTensor, groundTruthInput); - }); - - return (evalSess, _labelTensor, evaluationStep, prediction); - } - - private (Tensor, Tensor) AddEvaluationStep(Tensor resultTensor, Tensor groundTruthTensor) - { - Tensor evaluationStep = null; - Tensor correctPrediction = null; - - 
tf_with(tf.name_scope("accuracy"), scope => - { - tf_with(tf.name_scope("correct_prediction"), delegate - { - _prediction = tf.argmax(resultTensor, 1); - correctPrediction = tf.equal(_prediction, groundTruthTensor); - }); - - tf_with(tf.name_scope("accuracy"), delegate - { - evaluationStep = tf.reduce_mean(tf.cast(correctPrediction, tf.float32)); - }); - }); - - tf.summary.scalar("accuracy", evaluationStep); - return (evaluationStep, _prediction); - } - - private void UpdateTransferLearningModelOnDisk(DnnEstimator.Options options, int classCount) - { - var (sess, _, _, _) = BuildEvaluationSession(options, classCount); - var graph = sess.graph; - var outputGraphDef = tf.graph_util.convert_variables_to_constants( - sess, graph.as_graph_def(), new string[] { _softMaxTensor.name.Split(':')[0], _prediction.name.Split(':')[0] }); - - string frozenModelPath = _checkpointPath + ".pb"; - File.WriteAllBytes(_checkpointPath + ".pb", outputGraphDef.ToByteArray()); - _session = LoadTFSessionByModelFilePath(_env, frozenModelPath, false); - } - - private void VariableSummaries(RefVariable var) - { - tf_with(tf.name_scope("summaries"), delegate - { - var mean = tf.reduce_mean(var); - tf.summary.scalar("mean", mean); - Tensor stddev = null; - tf_with(tf.name_scope("stddev"), delegate - { - stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))); - }); - tf.summary.scalar("stddev", stddev); - tf.summary.scalar("max", tf.reduce_max(var)); - tf.summary.scalar("min", tf.reduce_min(var)); - tf.summary.histogram("histogram", var); - }); - } - - private (Operation, Tensor, Tensor, Tensor) AddFinalRetrainOps(int classCount, string labelColumn, - string scoreColumnName, float learningRate, Tensor bottleneckTensor, bool isTraining) - { - var (batch_size, bottleneck_tensor_size) = (bottleneckTensor.TensorShape.Dimensions[0], bottleneckTensor.TensorShape.Dimensions[1]); - tf_with(tf.name_scope("input"), scope => - { - _labelTensor = tf.placeholder(tf.int64, new TensorShape(batch_size), name: 
labelColumn); - }); - - string layerName = "final_retrain_ops"; - Tensor logits = null; - tf_with(tf.name_scope(layerName), scope => - { - RefVariable layerWeights = null; - tf_with(tf.name_scope("weights"), delegate - { - var initialValue = tf.truncated_normal(new int[] { bottleneck_tensor_size, classCount }, stddev: 0.001f); - layerWeights = tf.Variable(initialValue, name: "final_weights"); - VariableSummaries(layerWeights); - }); - - RefVariable layerBiases = null; - tf_with(tf.name_scope("biases"), delegate - { - layerBiases = tf.Variable(tf.zeros(classCount), name: "final_biases"); - VariableSummaries(layerBiases); - }); - - tf_with(tf.name_scope("Wx_plus_b"), delegate - { - var matmul = tf.matmul(bottleneckTensor, layerWeights); - logits = matmul + layerBiases; - tf.summary.histogram("pre_activations", logits); - }); - }); - - _softMaxTensor = tf.nn.softmax(logits, name: scoreColumnName); - - tf.summary.histogram("activations", _softMaxTensor); - if (!isTraining) - return (null, null, _labelTensor, _softMaxTensor); - - Tensor crossEntropyMean = null; - tf_with(tf.name_scope("cross_entropy"), delegate - { - crossEntropyMean = tf.losses.sparse_softmax_cross_entropy( - labels: _labelTensor, logits: logits); - }); - - tf.summary.scalar("cross_entropy", crossEntropyMean); - - tf_with(tf.name_scope("train"), delegate - { - var optimizer = tf.train.GradientDescentOptimizer(learningRate); - _trainStep = optimizer.minimize(crossEntropyMean); - }); - - return (_trainStep, crossEntropyMean, _labelTensor, _softMaxTensor); - } - - private void AddTransferLearningLayer(string labelColumn, - string scoreColumnName, float learningRate, int classCount) - { - _bottleneckTensor = Graph.OperationByName(_bottleneckOperationName); - tf_with(Graph.as_default(), delegate - { - (_trainStep, _crossEntropy, _labelTensor, _softMaxTensor) = - AddFinalRetrainOps(classCount, labelColumn, scoreColumnName, learningRate, _bottleneckTensor, true); - }); - } - private static ITensorValueGetter 
CreateTensorValueGetter(DataViewRow input, bool isVector, int colIndex, TensorShape tfShape, bool keyType = false) { if (isVector) @@ -833,9 +495,7 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Dat => Create(env, ctx).MakeRowMapper(inputSchema); private static void GetModelInfo(IHostEnvironment env, ModelLoadContext ctx, out string[] inputs, - out string[] outputs, out bool isFrozen, out bool addBatchDimensionInput, out bool transferLearning, - out string labelColumn, out string checkpointName, out Architecture arch, - out string scoreColumnName, out string predictedColumnName, out float learningRate, out int classCount, out string predictionTensorName, out string softMaxTensorName) + out string[] outputs, out bool isFrozen, out bool addBatchDimensionInput) { isFrozen = ctx.Reader.ReadBoolByte(); addBatchDimensionInput = ctx.Reader.ReadBoolByte(); @@ -851,26 +511,12 @@ private static void GetModelInfo(IHostEnvironment env, ModelLoadContext ctx, out outputs = new string[numOutputs]; for (int j = 0; j < outputs.Length; j++) outputs[j] = ctx.LoadNonEmptyString(); - - transferLearning = ctx.Reader.ReadBoolean(); - labelColumn = ctx.Reader.ReadString(); - checkpointName = ctx.Reader.ReadString(); - arch = (Architecture)ctx.Reader.ReadInt32(); - scoreColumnName = ctx.Reader.ReadString(); - predictedColumnName = ctx.Reader.ReadString(); - learningRate = ctx.Reader.ReadFloat(); - classCount = ctx.Reader.ReadInt32(); - predictionTensorName = ctx.Reader.ReadString(); - softMaxTensorName = ctx.Reader.ReadString(); - } - internal DnnTransformer(IHostEnvironment env, Session session, string[] outputColumnNames, + internal DnnRetrainTransformer(IHostEnvironment env, Session session, string[] outputColumnNames, string[] inputColumnNames, string modelLocation, bool isTemporarySavedModel, - bool addBatchDimensionInput, int batchSize, bool transferLearning, string labelColumnName, string checkpointName, Architecture arch, - string scoreColumnName, 
string predictedLabelColumnName, float learningRate, DataViewSchema inputSchema, int? classCount = null, bool loadModel = false, - string predictionTensorName = null, string softMaxTensorName = null) - : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(DnnTransformer))) + bool addBatchDimensionInput, int batchSize) + : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(DnnRetrainTransformer))) { Host.CheckValue(session, nameof(session)); @@ -885,76 +531,15 @@ internal DnnTransformer(IHostEnvironment env, Session session, string[] outputCo _inputs = inputColumnNames; _outputs = outputColumnNames; _idvToTfMapping = new Dictionary(); - _transferLearning = transferLearning; - _labelColumnName = labelColumnName; - _checkpointName = checkpointName; - _arch = arch; - _scoreColumnName = scoreColumnName; - _predictedLabelColumnName = predictedLabelColumnName; - _learningRate = learningRate; - _softmaxTensorName = softMaxTensorName; - _predictionTensorName = predictionTensorName; - if (transferLearning) - { - if (classCount == null) - { - var labelColumn = inputSchema.GetColumnOrNull(labelColumnName).Value; - var labelType = labelColumn.Type; - var labelCount = labelType.GetKeyCount(); - if (labelCount <= 0) - throw Host.ExceptSchemaMismatch(nameof(inputSchema), "label", (string)labelColumn.Name, "Key", (string)labelType.ToString()); - - _classCount = labelCount == 1 ? 2 : (int)labelCount; - } - else - _classCount = classCount.Value; - _checkpointPath = Path.Combine(Directory.GetCurrentDirectory(), modelLocation + checkpointName); + foreach (var x in _inputs) + _idvToTfMapping[x] = x; - // Configure bottleneck tensor based on the model. 
- if (arch == DnnEstimator.Architecture.ResnetV2101) - _bottleneckOperationName = "resnet_v2_101/SpatialSqueeze"; - else if(arch == DnnEstimator.Architecture.InceptionV3) - _bottleneckOperationName = "module_apply_default/hub_output/feature_vector/SpatialSqueeze"; + foreach (var x in _outputs) + _idvToTfMapping[x] = x; - if (arch == DnnEstimator.Architecture.ResnetV2101) - _idvToTfMapping[_inputs[0]] = "input"; - else if (arch == DnnEstimator.Architecture.InceptionV3) - _idvToTfMapping[_inputs[0]] = "Placeholder"; + (_tfOutputTypes, _outputTypes, _tfOutputOperations) = GetOutputInfo(Host, _session, _outputs); - _outputs = new[] { scoreColumnName, predictedLabelColumnName }; - - if (loadModel == false) - { - // Add transfer learning layer. - AddTransferLearningLayer(labelColumnName, scoreColumnName, learningRate, _classCount); - - // Initialize the variables. - new Runner(_session).AddOperation(tf.global_variables_initializer()).Run(); - - // Add evaluation layer. - (_evaluationStep, _) = AddEvaluationStep(_softMaxTensor, _labelTensor); - _softmaxTensorName = _softMaxTensor.name; - _predictionTensorName = _prediction.name; - } - - _idvToTfMapping[scoreColumnName] = _softmaxTensorName; - _idvToTfMapping[predictedLabelColumnName] = _predictionTensorName; - - (_tfOutputTypes, _outputTypes, _tfOutputOperations) = GetOutputInfo(Host, _session, new[] { _softmaxTensorName, _predictionTensorName }); - _transferLearning = true; - } - else - { - foreach (var x in _inputs) - _idvToTfMapping[x] = x; - - foreach (var x in _outputs) - _idvToTfMapping[x] = x; - - (_tfOutputTypes, _outputTypes, _tfOutputOperations) = GetOutputInfo(Host, _session, _outputs); - - } (_tfInputTypes, _tfInputShapes, _tfInputOperations) = GetInputInfo(Host, _session, _inputs.Select(x => _idvToTfMapping[x]).ToArray(), batchSize); _tfInputNodes = new TF_Output[_inputs.Length]; @@ -1093,7 +678,7 @@ private protected override void SaveModel(ModelSaveContext ctx) // for each output column // int: id of output 
column name // stream: tensorFlow model. - var isFrozen = _transferLearning || DnnUtils.IsSavedModel(_env, _modelLocation); + var isFrozen = DnnUtils.IsSavedModel(_env, _modelLocation); ctx.Writer.WriteBoolByte(isFrozen); ctx.Writer.WriteBoolByte(_addBatchDimensionInput); @@ -1107,58 +692,35 @@ private protected override void SaveModel(ModelSaveContext ctx) foreach (var colName in _outputs) ctx.SaveNonEmptyString(colName); - ctx.Writer.Write(_transferLearning); - ctx.Writer.Write(_labelColumnName); - ctx.Writer.Write(_checkpointName); - ctx.Writer.Write((int)_arch); - ctx.Writer.Write(_scoreColumnName); - ctx.Writer.Write(_predictedLabelColumnName); - ctx.Writer.Write(_learningRate); - ctx.Writer.Write(_classCount); - ctx.Writer.Write(_predictionTensorName); - ctx.Writer.Write(_softmaxTensorName); - - if (isFrozen || _transferLearning) - { - Status status = new Status(); - var buffer = _session.graph.ToGraphDef(status); - ctx.SaveBinaryStream("TFModel", w => - { - w.WriteByteArray(buffer.Data); - }); - } - else + ctx.SaveBinaryStream("TFSavedModel", w => { - ctx.SaveBinaryStream("TFSavedModel", w => + // only these files need to be saved. + string[] modelFilePaths = { - // only these files need to be saved. 
- string[] modelFilePaths = - { - Path.Combine(_modelLocation, DefaultModelFileNames.Graph), - Path.Combine(_modelLocation, DefaultModelFileNames.VariablesFolder, DefaultModelFileNames.Data), - Path.Combine(_modelLocation, DefaultModelFileNames.VariablesFolder, DefaultModelFileNames.Index), - }; + Path.Combine(_modelLocation, DefaultModelFileNames.Graph), + Path.Combine(_modelLocation, DefaultModelFileNames.VariablesFolder, DefaultModelFileNames.Data), + Path.Combine(_modelLocation, DefaultModelFileNames.VariablesFolder, DefaultModelFileNames.Index), + }; - w.Write(modelFilePaths.Length); + w.Write(modelFilePaths.Length); - foreach (var fullPath in modelFilePaths) - { - var relativePath = fullPath.Substring(_modelLocation.Length + 1); - w.Write(relativePath); + foreach (var fullPath in modelFilePaths) + { + var relativePath = fullPath.Substring(_modelLocation.Length + 1); + w.Write(relativePath); - using (var fs = new FileStream(fullPath, FileMode.Open)) - { - long fileLength = fs.Length; - w.Write(fileLength); - long actualWritten = fs.CopyRange(w.BaseStream, fileLength); - Host.Assert(actualWritten == fileLength); - } + using (var fs = new FileStream(fullPath, FileMode.Open)) + { + long fileLength = fs.Length; + w.Write(fileLength); + long actualWritten = fs.CopyRange(w.BaseStream, fileLength); + Host.Assert(actualWritten == fileLength); } - }); - } + } + }); } - ~DnnTransformer() + ~DnnRetrainTransformer() { Dispose(false); } @@ -1187,13 +749,13 @@ private void Dispose(bool disposing) private sealed class Mapper : MapperBase { - private readonly DnnTransformer _parent; + private readonly DnnRetrainTransformer _parent; private readonly int[] _inputColIndices; private readonly bool[] _isInputVector; private readonly TensorShape[] _fullySpecifiedShapes; private readonly ConcurrentBag _runners; - public Mapper(DnnTransformer parent, DataViewSchema inputSchema) : + public Mapper(DnnRetrainTransformer parent, DataViewSchema inputSchema) : 
base(Contracts.CheckRef(parent, nameof(parent)).Host.Register(nameof(Mapper)), inputSchema, parent) { Host.CheckValue(parent, nameof(parent)); @@ -1612,28 +1174,11 @@ public Tensor GetBufferedBatchTensor() } } - /// - public sealed class DnnEstimator : IEstimator + /// + public sealed class DnnRetrainEstimator : IEstimator { /// - /// Image classification model. - /// - public enum Architecture - { - ResnetV2101, - InceptionV3 - }; - - /// - /// Backend DNN training framework. - /// - public enum DnnFramework - { - Tensorflow - }; - - /// - /// The options for the . + /// The options for the . /// internal sealed class Options : TransformInputBase { @@ -1729,12 +1274,6 @@ internal sealed class Options : TransformInputBase [Argument(ArgumentType.AtMostOnce, HelpText = "Name of the input in TensorFlow graph that specifiy the location for saving/restoring models from disk.", SortOrder = 14)] public string SaveOperation = "save/control_dependency"; - /// - /// Needed for command line to specify if retraining is requested. - /// - [Argument(ArgumentType.AtMostOnce, HelpText = "Retrain TensorFlow model.", SortOrder = 15)] - public bool ReTrain = false; - /// /// Add a batch dimension to the input e.g. input = [224, 224, 3] => [-1, 224, 224, 3]. /// @@ -1744,42 +1283,6 @@ internal sealed class Options : TransformInputBase /// [Argument(ArgumentType.AtMostOnce, HelpText = "Add a batch dimension to the input e.g. input = [224, 224, 3] => [-1, 224, 224, 3].", SortOrder = 16)] public bool AddBatchDimensionInputs = false; - - /// - /// Indicates if transfer learning is requested. - /// - [Argument(ArgumentType.AtMostOnce, HelpText = "Transfer learning on a model.", SortOrder = 15)] - public bool TransferLearning = false; - - /// - /// Specifies the model architecture to be used in the case of image classification training using transfer learning. 
- /// - [Argument(ArgumentType.AtMostOnce, HelpText = "Model architecture to be used in transfer learning for image classification.", SortOrder = 15)] - public Architecture Arch = Architecture.ResnetV2101; - - /// - /// Name of the tensor that will contain the output scores of the last layer when transfer learning is done. - /// - [Argument(ArgumentType.AtMostOnce, HelpText = "Softmax tensor of the last layer in transfer learning.", SortOrder = 15)] - public string ScoreColumnName = "Scores"; - - /// - /// Name of the tensor that will contain the predicted label from output scores of the last layer when transfer learning is done. - /// - [Argument(ArgumentType.AtMostOnce, HelpText = "Argmax tensor of the last layer in transfer learning.", SortOrder = 15)] - public string PredictedLabelColumnName = "PredictedLabel"; - - /// - /// Checkpoint folder to store graph files in the event of transfer learning. - /// - [Argument(ArgumentType.AtMostOnce, HelpText = "Checkpoint folder to store graph files in the event of transfer learning.", SortOrder = 15)] - public string CheckpointName = "_retrain_checkpoint"; - - /// - /// Use train set to measure model accuracy between each epoch. 
- /// - [Argument(ArgumentType.AtMostOnce, HelpText = "Use train set to measure model accuracy between each epoch.", SortOrder = 15)] - public bool MeasureTrainAccuracy = false; } private readonly IHost _host; @@ -1787,25 +1290,16 @@ internal sealed class Options : TransformInputBase private readonly DnnModel _tensorFlowModel; private readonly TF_DataType[] _tfInputTypes; private readonly DataViewType[] _outputTypes; - private DnnTransformer _transformer; + private DnnRetrainTransformer _transformer; - internal DnnEstimator(IHostEnvironment env, Options options, DnnModel tensorFlowModel) + internal DnnRetrainEstimator(IHostEnvironment env, Options options, DnnModel tensorFlowModel) { - _host = Contracts.CheckRef(env, nameof(env)).Register(nameof(DnnEstimator)); + _host = Contracts.CheckRef(env, nameof(env)).Register(nameof(DnnRetrainEstimator)); _options = options; _tensorFlowModel = tensorFlowModel; - - if (options.TransferLearning) - _tfInputTypes = new[] { TF_DataType.TF_FLOAT }; - else - { - var inputTuple = DnnTransformer.GetInputInfo(_host, tensorFlowModel.Session, options.InputColumns); - _tfInputTypes = inputTuple.tfInputTypes; - } - if (options.TransferLearning) - _outputTypes = new[] { new VectorDataViewType(NumberDataViewType.Single), new VectorDataViewType(NumberDataViewType.Single, 1) }; - else - _outputTypes = DnnTransformer.GetOutputInfo(_host, tensorFlowModel.Session, options.OutputColumns).outputTypes; + var inputTuple = DnnRetrainTransformer.GetInputInfo(_host, tensorFlowModel.Session, options.InputColumns); + _tfInputTypes = inputTuple.tfInputTypes; + _outputTypes = DnnRetrainTransformer.GetOutputInfo(_host, tensorFlowModel.Session, options.OutputColumns).outputTypes; } private static Options CreateArguments(DnnModel tensorFlowModel, string[] outputColumnNames, string[] inputColumnName, bool addBatchDimensionInput) @@ -1814,7 +1308,6 @@ private static Options CreateArguments(DnnModel tensorFlowModel, string[] output options.ModelLocation = 
tensorFlowModel.ModelPath; options.InputColumns = inputColumnName; options.OutputColumns = outputColumnNames; - options.ReTrain = false; options.AddBatchDimensionInputs = addBatchDimensionInput; return options; } @@ -1849,13 +1342,13 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) } /// - /// Trains and returns a . + /// Trains and returns a . /// - public DnnTransformer Fit(IDataView input) + public DnnRetrainTransformer Fit(IDataView input) { _host.CheckValue(input, nameof(input)); if (_transformer == null) - _transformer = new DnnTransformer(_host, _options, _tensorFlowModel, input); + _transformer = new DnnRetrainTransformer(_host, _options, _tensorFlowModel, input); // Validate input schema. _transformer.GetOutputSchema(input.Schema); diff --git a/src/Microsoft.ML.Dnn/DnnUtils.cs b/src/Microsoft.ML.Dnn/DnnUtils.cs index 3aa727f41d..623e103997 100644 --- a/src/Microsoft.ML.Dnn/DnnUtils.cs +++ b/src/Microsoft.ML.Dnn/DnnUtils.cs @@ -381,6 +381,25 @@ public Runner AddInput(string input, Tensor value) return this; } + public Runner AddInput(string input) + { + _inputs.Add(ParseOutput(input)); + return this; + } + + public Runner AddInput(Tensor value, int index) + { + if (_inputValues.Count <= index) + _inputValues.Add(value); + else + { + _inputValues[index].Dispose(); + _inputValues[index] = value; + } + + return this; + } + public Runner AddOutputs(string output) { _outputs.Add(ParseOutput(output)); @@ -444,10 +463,6 @@ public Tensor[] Run() return result; } - public Runner CloneRunner() - { - return new Runner(_session); - } } } diff --git a/src/Microsoft.ML.Dnn/ImageClassificationTransform.cs b/src/Microsoft.ML.Dnn/ImageClassificationTransform.cs new file mode 100644 index 0000000000..4729731972 --- /dev/null +++ b/src/Microsoft.ML.Dnn/ImageClassificationTransform.cs @@ -0,0 +1,1222 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.InteropServices; +using Google.Protobuf; +using Microsoft.ML; +using Microsoft.ML.CommandLine; +using Microsoft.ML.Data; +using Microsoft.ML.Internal.Utilities; +using Microsoft.ML.Runtime; +using Microsoft.ML.Transforms; +using Microsoft.ML.Transforms.Dnn; +using Tensorflow; +using Tensorflow.Summaries; +using static Microsoft.ML.Data.TextLoader; +using static Microsoft.ML.Transforms.Dnn.DnnUtils; +using static Microsoft.ML.Transforms.ImageClassificationEstimator; +using static Tensorflow.Python; +using Architecture = Microsoft.ML.Transforms.ImageClassificationEstimator.Architecture; + +[assembly: LoadableClass(ImageClassificationTransformer.Summary, typeof(IDataTransform), typeof(ImageClassificationTransformer), + typeof(ImageClassificationEstimator.Options), typeof(SignatureDataTransform), ImageClassificationTransformer.UserName, ImageClassificationTransformer.ShortName)] + +[assembly: LoadableClass(ImageClassificationTransformer.Summary, typeof(IDataTransform), typeof(ImageClassificationTransformer), null, typeof(SignatureLoadDataTransform), + ImageClassificationTransformer.UserName, ImageClassificationTransformer.LoaderSignature)] + +[assembly: LoadableClass(typeof(ImageClassificationTransformer), null, typeof(SignatureLoadModel), + ImageClassificationTransformer.UserName, ImageClassificationTransformer.LoaderSignature)] + +[assembly: LoadableClass(typeof(IRowMapper), typeof(ImageClassificationTransformer), null, typeof(SignatureLoadRowMapper), + ImageClassificationTransformer.UserName, ImageClassificationTransformer.LoaderSignature)] + +namespace Microsoft.ML.Transforms +{ + /// + /// for the . 
+ /// + public sealed class ImageClassificationTransformer : RowToRowTransformerBase + { + private readonly IHostEnvironment _env; + private readonly bool _addBatchDimensionInput; + private Session _session; + private Tensor _bottleneckTensor; + private Operation _trainStep; + private Tensor _softMaxTensor; + private Tensor _crossEntropy; + private Tensor _labelTensor; + private Tensor _evaluationStep; + private Tensor _prediction; + private Tensor _bottleneckInput; + private Tensor _jpegData; + private Tensor _resizedImage; + private string _jpegDataTensorName; + private string _resizedImageTensorName; + private string _inputTensorName; + private readonly int _classCount; + private readonly string _checkpointPath; + private readonly string _bottleneckOperationName; + private Graph Graph => _session.graph; + private readonly string[] _inputs; + private readonly string[] _outputs; + private readonly string _labelColumnName; + private readonly string _finalModelPrefix; + private readonly Architecture _arch; + private readonly string _scoreColumnName; + private readonly string _predictedLabelColumnName; + private readonly float _learningRate; + private readonly string _softmaxTensorName; + private readonly string _predictionTensorName; + internal const string Summary = "Trains Dnn models."; + internal const string UserName = "ImageClassificationTransform"; + internal const string ShortName = "ImgClsTrans"; + internal const string LoaderSignature = "ImageClassificationTrans"; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "IMGTRANS", + //verWrittenCur: 0x00010001, // Initial + verWrittenCur: 0x00000001, + verReadableCur: 0x00000001, + verWeCanReadBack: 0x00000001, + loaderSignature: LoaderSignature, + loaderAssemblyName: typeof(ImageClassificationTransformer).Assembly.FullName); + } + + // Factory method for SignatureLoadModel. 
+ private static ImageClassificationTransformer Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + + // *** Binary format *** + // byte: indicator for frozen models + // byte: indicator for adding batch dimension in input + // int: number of input columns + // for each input column + // int: id of int column name + // int: number of output columns + // for each output column + // int: id of output column name + // stream: tensorFlow model. + + GetModelInfo(env, ctx, out string[] inputs, out string[] outputs, out bool addBatchDimensionInput, + out string labelColumn, out string checkpointName, out Architecture arch, out string scoreColumnName, + out string predictedColumnName, out float learningRate, out int classCount, out string predictionTensorName, out string softMaxTensorName, + out string jpegDataTensorName, out string resizeTensorName); + + byte[] modelBytes = null; + if (!ctx.TryLoadBinaryStream("TFModel", r => modelBytes = r.ReadByteArray())) + throw env.ExceptDecode(); + + return new ImageClassificationTransformer(env, DnnUtils.LoadTFSession(env, modelBytes), outputs, inputs, + null, addBatchDimensionInput, 1, labelColumn, checkpointName, arch, + scoreColumnName, predictedColumnName, learningRate, null, classCount, true, predictionTensorName, + softMaxTensorName, jpegDataTensorName, resizeTensorName); + + } + + // Factory method for SignatureDataTransform. 
+ internal static IDataTransform Create(IHostEnvironment env, ImageClassificationEstimator.Options options, IDataView input) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(options, nameof(options)); + env.CheckValue(input, nameof(input)); + env.CheckValue(options.InputColumns, nameof(options.InputColumns)); + env.CheckValue(options.OutputColumns, nameof(options.OutputColumns)); + + return new ImageClassificationTransformer(env, options, input).MakeDataTransform(input); + } + + internal ImageClassificationTransformer(IHostEnvironment env, ImageClassificationEstimator.Options options, IDataView input) + : this(env, options, DnnUtils.LoadDnnModel(env, options.ModelLocation), input) + { + } + + internal ImageClassificationTransformer(IHostEnvironment env, ImageClassificationEstimator.Options options, DnnModel tensorFlowModel, IDataView input) + : this(env, tensorFlowModel.Session, options.OutputColumns, options.InputColumns, + options.ModelLocation, null, options.BatchSize, + options.LabelColumn, options.FinalModelPrefix, options.Arch, options.ScoreColumnName, + options.PredictedLabelColumnName, options.LearningRate, input.Schema) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(options, nameof(options)); + env.CheckValue(input, nameof(input)); + CheckTrainingParameters(options); + var imageProcessor = new ImageProcessor(this); + if (!options.ReuseTrainSetBottleneckCachedValues || !File.Exists(options.TrainSetBottleneckCachedValuesFilePath)) + CacheFeaturizedImagesToDisk(input, options.LabelColumn, options.InputColumns[0], imageProcessor, + _inputTensorName, _bottleneckTensor.name, options.TrainSetBottleneckCachedValuesFilePath, + ImageClassificationMetrics.Dataset.Train, options.MetricsCallback); + + if (options.ValidationSet != null && + (!options.ReuseTrainSetBottleneckCachedValues || !File.Exists(options.ValidationSetBottleneckCachedValuesFilePath))) + CacheFeaturizedImagesToDisk(options.ValidationSet, options.LabelColumn, 
options.InputColumns[0], + imageProcessor, _inputTensorName, _bottleneckTensor.name, options.ValidationSetBottleneckCachedValuesFilePath, + ImageClassificationMetrics.Dataset.Validation, options.MetricsCallback); + + TrainAndEvaluateClassificationLayer(options.TrainSetBottleneckCachedValuesFilePath, options, options.ValidationSetBottleneckCachedValuesFilePath); + } + + private void CheckTrainingParameters(ImageClassificationEstimator.Options options) + { + Host.CheckNonWhiteSpace(options.LabelColumn, nameof(options.LabelColumn)); + Host.CheckNonWhiteSpace(options.TensorFlowLabel, nameof(options.TensorFlowLabel)); + + if (_session.graph.OperationByName(options.TensorFlowLabel) == null) + throw Host.ExceptParam(nameof(options.TensorFlowLabel), $"'{options.TensorFlowLabel}' does not exist in the model"); + } + + private (Tensor, Tensor) AddJpegDecoding(int height, int width, int depth) + { + // height, width, depth + var inputDim = (height, width, depth); + var jpegData = tf.placeholder(tf.@string, name: "DecodeJPGInput"); + var decodedImage = tf.image.decode_jpeg(jpegData, channels: inputDim.Item3); + // Convert from full range of uint8 to range [0,1] of float32. 
+ var decodedImageAsFloat = tf.image.convert_image_dtype(decodedImage, tf.float32); + var decodedImage4d = tf.expand_dims(decodedImageAsFloat, 0); + var resizeShape = tf.stack(new int[] { inputDim.Item1, inputDim.Item2 }); + var resizeShapeAsInt = tf.cast(resizeShape, dtype: tf.int32); + var resizedImage = tf.image.resize_bilinear(decodedImage4d, resizeShapeAsInt, false, "ResizeTensor"); + return (jpegData, resizedImage); + } + + private sealed class ImageProcessor + { + private Runner _imagePreprocessingRunner; + + public ImageProcessor(ImageClassificationTransformer transformer) + { + _imagePreprocessingRunner = new Runner(transformer._session); + _imagePreprocessingRunner.AddInput(transformer._jpegDataTensorName); + _imagePreprocessingRunner.AddOutputs(transformer._resizedImageTensorName); + } + + public Tensor ProcessImage(string path) + { + var imageTensor = new Tensor(File.ReadAllBytes(path), TF_DataType.TF_STRING); + var processedTensor = _imagePreprocessingRunner.AddInput(imageTensor, 0).Run()[0]; + imageTensor.Dispose(); + return processedTensor; + } + } + + private void CacheFeaturizedImagesToDisk(IDataView input, string labelColumnName, string imagepathColumnName, + ImageProcessor imageProcessor, string inputTensorName, string outputTensorName, string cacheFilePath, + ImageClassificationMetrics.Dataset dataset, ImageClassificationMetricsCallback metricsCallback) + { + var labelColumn = input.Schema[labelColumnName]; + + if (labelColumn.Type.RawType != typeof(UInt32)) + throw Host.ExceptSchemaMismatch(nameof(labelColumn), "Label", + labelColumnName, typeof(uint).ToString(), + labelColumn.Type.RawType.ToString()); + + var imagePathColumn = input.Schema[imagepathColumnName]; + Runner runner = new Runner(_session); + runner.AddOutputs(outputTensorName); + + using (TextWriter writer = File.CreateText(cacheFilePath)) + using (var cursor = input.GetRowCursor(input.Schema.Where(c => c.Index == labelColumn.Index || c.Index == imagePathColumn.Index))) + { + var 
labelGetter = cursor.GetGetter(labelColumn); + var imagePathGetter = cursor.GetGetter>(imagePathColumn); + UInt32 label = UInt32.MaxValue; + ReadOnlyMemory imagePath = default; + runner.AddInput(inputTensorName); + ImageClassificationMetrics metrics = new ImageClassificationMetrics(); + metrics.Bottleneck = new BottleneckMetrics(); + metrics.Bottleneck.DatasetUsed = dataset; + while (cursor.MoveNext()) + { + labelGetter(ref label); + imagePathGetter(ref imagePath); + var imagePathStr = imagePath.ToString(); + var imageTensor = imageProcessor.ProcessImage(imagePathStr); + runner.AddInput(imageTensor, 0); + var featurizedImage = runner.Run()[0]; // Reuse memory? + writer.WriteLine(label - 1 + "," + string.Join(",", featurizedImage.Data())); + featurizedImage.Dispose(); + imageTensor.Dispose(); + metrics.Bottleneck.Index++; + metrics.Bottleneck.Name = imagePathStr; + metricsCallback?.Invoke(metrics); + } + } + } + + private IDataView GetShuffledData(string path) + { + return new RowShufflingTransformer( + _env, + new RowShufflingTransformer.Options + { + ForceShuffle = true, + ForceShuffleSource = true + }, + new TextLoader( + _env, + new TextLoader.Options + { + Separators = new[] { ',' }, + Columns = new[] + { + new Column("Label", DataKind.Int64, 0), + new Column("Features", DataKind.Single, new [] { new Range(1, null) }), + }, + }, + new MultiFileSource(path)) + .Load(new MultiFileSource(path))); + } + + private void TrainAndEvaluateClassificationLayer(string trainBottleneckFilePath, ImageClassificationEstimator.Options options, + string validationSetBottleneckFilePath) + { + int batchSize = options.BatchSize; + int epochs = options.Epoch; + bool evaluateOnly = !string.IsNullOrEmpty(validationSetBottleneckFilePath); + ImageClassificationMetricsCallback statisticsCallback = options.MetricsCallback; + var trainingSet = GetShuffledData(trainBottleneckFilePath); + IDataView validationSet = null; + if (options.ValidationSet != null && 
!string.IsNullOrEmpty(validationSetBottleneckFilePath)) + validationSet = GetShuffledData(validationSetBottleneckFilePath); + + long label = long.MaxValue; + VBuffer features = default; + ReadOnlySpan featureValues = default; + var featureColumn = trainingSet.Schema[1]; + int featureLength = featureColumn.Type.GetVectorSize(); + float[] featureBatch = new float[featureLength * batchSize]; + var featureBatchHandle = GCHandle.Alloc(featureBatch, GCHandleType.Pinned); + IntPtr featureBatchPtr = featureBatchHandle.AddrOfPinnedObject(); + int featureBatchSizeInBytes = sizeof(float) * featureBatch.Length; + long[] labelBatch = new long[batchSize]; + var labelBatchHandle = GCHandle.Alloc(labelBatch, GCHandleType.Pinned); + IntPtr labelBatchPtr = labelBatchHandle.AddrOfPinnedObject(); + int labelBatchSizeInBytes = sizeof(long) * labelBatch.Length; + var labelTensorShape = _labelTensor.TensorShape.dims.Select(x => (long)x).ToArray(); + labelTensorShape[0] = batchSize; + int batchIndex = 0; + var runner = new Runner(_session); + var testEvalRunner = new Runner(_session); + testEvalRunner.AddOutputs(_evaluationStep.name); + testEvalRunner.AddOutputs(_crossEntropy.name); + + Runner validationEvalRunner = null; + if (validationSet != null) + { + validationEvalRunner = new Runner(_session); + validationEvalRunner.AddOutputs(_evaluationStep.name); + validationEvalRunner.AddInput(_bottleneckInput.name).AddInput(_labelTensor.name); + } + + runner.AddOperation(_trainStep); + var featureTensorShape = _bottleneckInput.TensorShape.dims.Select(x => (long)x).ToArray(); + featureTensorShape[0] = batchSize; + + Saver trainSaver = null; + FileWriter trainWriter = null; + Tensor merged = tf.summary.merge_all(); + trainWriter = tf.summary.FileWriter(Path.Combine(Directory.GetCurrentDirectory(), "train"), _session.graph); + trainSaver = tf.train.Saver(); + trainSaver.save(_session, _checkpointPath); + + runner.AddInput(_bottleneckInput.name).AddInput(_labelTensor.name); + 
testEvalRunner.AddInput(_bottleneckInput.name).AddInput(_labelTensor.name); + Dictionary classStatsTrain = new Dictionary(); + Dictionary classStatsValidate = new Dictionary(); + for (int index = 0; index < _classCount; index += 1) + classStatsTrain[index] = classStatsValidate[index] = 0; + + ImageClassificationMetrics metrics = new ImageClassificationMetrics(); + metrics.Train = new TrainMetrics(); + for (int epoch = 0; epoch < epochs; epoch += 1) + { + metrics.Train.Accuracy = 0; + metrics.Train.CrossEntropy = 0; + metrics.Train.BatchProcessedCount = 0; + using (var cursor = trainingSet.GetRowCursor(trainingSet.Schema.ToArray(), new Random())) + { + var labelGetter = cursor.GetGetter(trainingSet.Schema[0]); + var featuresGetter = cursor.GetGetter>(featureColumn); + while (cursor.MoveNext()) + { + labelGetter(ref label); + featuresGetter(ref features); + classStatsTrain[label]++; + + if (featureValues == default) + featureValues = features.GetValues(); + + // Buffer the values. + for (int index = 0; index < featureLength; index += 1) + featureBatch[batchIndex * featureLength + index] = featureValues[index]; + + labelBatch[batchIndex] = label; + batchIndex += 1; + // Train. 
+ if (batchIndex == batchSize) + { + runner.AddInput(new Tensor(featureBatchPtr, featureTensorShape, TF_DataType.TF_FLOAT, featureBatchSizeInBytes), 0) + .AddInput(new Tensor(labelBatchPtr, labelTensorShape, TF_DataType.TF_INT64, labelBatchSizeInBytes), 1) + .Run(); + + metrics.Train.BatchProcessedCount += 1; + + if (options.TestOnTrainSet && statisticsCallback != null) + { + var outputTensors = testEvalRunner + .AddInput(new Tensor(featureBatchPtr, featureTensorShape, TF_DataType.TF_FLOAT, featureBatchSizeInBytes), 0) + .AddInput(new Tensor(labelBatchPtr, labelTensorShape, TF_DataType.TF_INT64, labelBatchSizeInBytes), 1) + .Run(); + + metrics.Train.Accuracy += outputTensors[0].Data()[0]; + metrics.Train.CrossEntropy += outputTensors[1].Data()[0]; + + outputTensors[0].Dispose(); + outputTensors[1].Dispose(); + } + + batchIndex = 0; + } + } + + if (options.TestOnTrainSet && statisticsCallback != null) + { + metrics.Train.Epoch = epoch; + metrics.Train.Accuracy /= metrics.Train.BatchProcessedCount; + metrics.Train.CrossEntropy /= metrics.Train.BatchProcessedCount; + metrics.Train.DatasetUsed = ImageClassificationMetrics.Dataset.Train; + statisticsCallback(metrics); + } + } + + if (validationSet == null) + continue; + + batchIndex = 0; + metrics.Train.BatchProcessedCount = 0; + metrics.Train.Accuracy = 0; + metrics.Train.CrossEntropy = 0; + using (var cursor = validationSet.GetRowCursor(validationSet.Schema.ToArray(), new Random())) + { + var labelGetter = cursor.GetGetter(validationSet.Schema[0]); + var featuresGetter = cursor.GetGetter>(featureColumn); + while (cursor.MoveNext()) + { + labelGetter(ref label); + featuresGetter(ref features); + classStatsValidate[label]++; + // Buffer the values. + for (int index = 0; index < featureLength; index += 1) + featureBatch[batchIndex * featureLength + index] = featureValues[index]; + + labelBatch[batchIndex] = label; + batchIndex += 1; + // Evaluate. 
+ if (batchIndex == batchSize) + { + var outputTensors = validationEvalRunner + .AddInput(new Tensor(featureBatchPtr, featureTensorShape, TF_DataType.TF_FLOAT, featureBatchSizeInBytes), 0) + .AddInput(new Tensor(labelBatchPtr, labelTensorShape, TF_DataType.TF_INT64, labelBatchSizeInBytes), 1) + .Run(); + + metrics.Train.Accuracy += outputTensors[0].Data()[0]; + metrics.Train.BatchProcessedCount += 1; + batchIndex = 0; + + outputTensors[0].Dispose(); + } + } + + if (statisticsCallback != null) + { + metrics.Train.Epoch = epoch; + metrics.Train.Accuracy /= metrics.Train.BatchProcessedCount; + metrics.Train.DatasetUsed = ImageClassificationMetrics.Dataset.Validation; + statisticsCallback(metrics); + } + } + } + + trainSaver.save(_session, _checkpointPath); + UpdateTransferLearningModelOnDisk(options, _classCount); + } + + private (Session, Tensor, Tensor, Tensor) BuildEvaluationSession(ImageClassificationEstimator.Options options, int classCount) + { + var evalGraph = DnnUtils.LoadMetaGraph(options.ModelLocation); + var evalSess = tf.Session(graph: evalGraph); + Tensor evaluationStep = null; + Tensor prediction = null; + Tensor bottleneckTensor = evalGraph.OperationByName(_bottleneckOperationName); + + tf_with(evalGraph.as_default(), graph => + { + var (_, _, groundTruthInput, finalTensor) = AddFinalRetrainOps(classCount, options.LabelColumn, + options.ScoreColumnName, options.LearningRate, bottleneckTensor, false); + + tf.train.Saver().restore(evalSess, _checkpointPath); + (evaluationStep, prediction) = AddEvaluationStep(finalTensor, groundTruthInput); + (_jpegData, _resizedImage) = AddJpegDecoding(299, 299, 3); + }); + + return (evalSess, _labelTensor, evaluationStep, prediction); + } + + private (Tensor, Tensor) AddEvaluationStep(Tensor resultTensor, Tensor groundTruthTensor) + { + Tensor evaluationStep = null; + Tensor correctPrediction = null; + + tf_with(tf.name_scope("accuracy"), scope => + { + tf_with(tf.name_scope("correct_prediction"), delegate + { + 
_prediction = tf.argmax(resultTensor, 1); + correctPrediction = tf.equal(_prediction, groundTruthTensor); + }); + + tf_with(tf.name_scope("accuracy"), delegate + { + evaluationStep = tf.reduce_mean(tf.cast(correctPrediction, tf.float32)); + }); + }); + + tf.summary.scalar("accuracy", evaluationStep); + return (evaluationStep, _prediction); + } + + private void UpdateTransferLearningModelOnDisk(ImageClassificationEstimator.Options options, int classCount) + { + var (sess, _, _, _) = BuildEvaluationSession(options, classCount); + var graph = sess.graph; + var outputGraphDef = tf.graph_util.convert_variables_to_constants( + sess, graph.as_graph_def(), new string[] { _softMaxTensor.name.Split(':')[0], _prediction.name.Split(':')[0], _jpegData.name.Split(':')[0], _resizedImage.name.Split(':')[0] }); + + string frozenModelPath = _checkpointPath + ".pb"; + File.WriteAllBytes(_checkpointPath + ".pb", outputGraphDef.ToByteArray()); + _session.graph.Dispose(); + _session.Dispose(); + _session = LoadTFSessionByModelFilePath(_env, frozenModelPath, false); + } + + private void VariableSummaries(RefVariable var) + { + tf_with(tf.name_scope("summaries"), delegate + { + var mean = tf.reduce_mean(var); + tf.summary.scalar("mean", mean); + Tensor stddev = null; + tf_with(tf.name_scope("stddev"), delegate + { + stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))); + }); + tf.summary.scalar("stddev", stddev); + tf.summary.scalar("max", tf.reduce_max(var)); + tf.summary.scalar("min", tf.reduce_min(var)); + tf.summary.histogram("histogram", var); + }); + } + + private (Operation, Tensor, Tensor, Tensor) AddFinalRetrainOps(int classCount, string labelColumn, + string scoreColumnName, float learningRate, Tensor bottleneckTensor, bool isTraining) + { + var (batch_size, bottleneck_tensor_size) = (bottleneckTensor.TensorShape.Dimensions[0], bottleneckTensor.TensorShape.Dimensions[1]); + tf_with(tf.name_scope("input"), scope => + { + if (isTraining) + { + _bottleneckInput = 
tf.placeholder_with_default( + bottleneckTensor, + shape: bottleneckTensor.TensorShape.Dimensions, + name: "BottleneckInputPlaceholder"); + } + + _labelTensor = tf.placeholder(tf.int64, new TensorShape(batch_size), name: labelColumn); + }); + + string layerName = "final_retrain_ops"; + Tensor logits = null; + tf_with(tf.name_scope(layerName), scope => + { + RefVariable layerWeights = null; + tf_with(tf.name_scope("weights"), delegate + { + var initialValue = tf.truncated_normal(new int[] { bottleneck_tensor_size, classCount }, stddev: 0.001f); + layerWeights = tf.Variable(initialValue, name: "final_weights"); + VariableSummaries(layerWeights); + }); + + RefVariable layerBiases = null; + tf_with(tf.name_scope("biases"), delegate + { + layerBiases = tf.Variable(tf.zeros(classCount), name: "final_biases"); + VariableSummaries(layerBiases); + }); + + tf_with(tf.name_scope("Wx_plus_b"), delegate + { + var matmul = tf.matmul(isTraining ? _bottleneckInput : bottleneckTensor, layerWeights); + logits = matmul + layerBiases; + tf.summary.histogram("pre_activations", logits); + }); + }); + + _softMaxTensor = tf.nn.softmax(logits, name: scoreColumnName); + + tf.summary.histogram("activations", _softMaxTensor); + if (!isTraining) + return (null, null, _labelTensor, _softMaxTensor); + + Tensor crossEntropyMean = null; + tf_with(tf.name_scope("cross_entropy"), delegate + { + crossEntropyMean = tf.losses.sparse_softmax_cross_entropy( + labels: _labelTensor, logits: logits); + }); + + tf.summary.scalar("cross_entropy", crossEntropyMean); + + tf_with(tf.name_scope("train"), delegate + { + var optimizer = tf.train.GradientDescentOptimizer(learningRate); + _trainStep = optimizer.minimize(crossEntropyMean); + }); + + return (_trainStep, crossEntropyMean, _labelTensor, _softMaxTensor); + } + + private void AddTransferLearningLayer(string labelColumn, + string scoreColumnName, float learningRate, int classCount) + { + _bottleneckTensor = Graph.OperationByName(_bottleneckOperationName); + 
tf_with(Graph.as_default(), delegate + { + (_trainStep, _crossEntropy, _labelTensor, _softMaxTensor) = + AddFinalRetrainOps(classCount, labelColumn, scoreColumnName, learningRate, _bottleneckTensor, true); + }); + } + + // Factory method for SignatureLoadDataTransform. + private static IDataTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + => Create(env, ctx).MakeDataTransform(input); + + // Factory method for SignatureLoadRowMapper. + private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, DataViewSchema inputSchema) + => Create(env, ctx).MakeRowMapper(inputSchema); + + private static void GetModelInfo(IHostEnvironment env, ModelLoadContext ctx, out string[] inputs, + out string[] outputs, out bool addBatchDimensionInput, + out string labelColumn, out string checkpointName, out Architecture arch, + out string scoreColumnName, out string predictedColumnName, out float learningRate, out int classCount, out string predictionTensorName, out string softMaxTensorName, + out string jpegDataTensorName, out string resizeTensorName) + { + addBatchDimensionInput = ctx.Reader.ReadBoolByte(); + + var numInputs = ctx.Reader.ReadInt32(); + env.CheckDecode(numInputs > 0); + inputs = new string[numInputs]; + for (int j = 0; j < inputs.Length; j++) + inputs[j] = ctx.LoadNonEmptyString(); + + var numOutputs = ctx.Reader.ReadInt32(); + env.CheckDecode(numOutputs > 0); + outputs = new string[numOutputs]; + for (int j = 0; j < outputs.Length; j++) + outputs[j] = ctx.LoadNonEmptyString(); + + labelColumn = ctx.Reader.ReadString(); + checkpointName = ctx.Reader.ReadString(); + arch = (Architecture)ctx.Reader.ReadInt32(); + scoreColumnName = ctx.Reader.ReadString(); + predictedColumnName = ctx.Reader.ReadString(); + learningRate = ctx.Reader.ReadFloat(); + classCount = ctx.Reader.ReadInt32(); + predictionTensorName = ctx.Reader.ReadString(); + softMaxTensorName = ctx.Reader.ReadString(); + jpegDataTensorName = ctx.Reader.ReadString(); + 
resizeTensorName = ctx.Reader.ReadString(); + } + + internal ImageClassificationTransformer(IHostEnvironment env, Session session, string[] outputColumnNames, + string[] inputColumnNames, string modelLocation, + bool? addBatchDimensionInput, int batchSize, string labelColumnName, string finalModelPrefix, Architecture arch, + string scoreColumnName, string predictedLabelColumnName, float learningRate, DataViewSchema inputSchema, int? classCount = null, bool loadModel = false, + string predictionTensorName = null, string softMaxTensorName = null, string jpegDataTensorName = null, string resizeTensorName = null) + : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ImageClassificationTransformer))) + + { + Host.CheckValue(session, nameof(session)); + Host.CheckNonEmpty(inputColumnNames, nameof(inputColumnNames)); + Host.CheckNonEmpty(outputColumnNames, nameof(outputColumnNames)); + + _env = env; + _session = session; + _addBatchDimensionInput = addBatchDimensionInput ?? arch == Architecture.ResnetV2101; + _inputs = inputColumnNames; + _outputs = outputColumnNames; + _labelColumnName = labelColumnName; + _finalModelPrefix = finalModelPrefix; + _arch = arch; + _scoreColumnName = scoreColumnName; + _predictedLabelColumnName = predictedLabelColumnName; + _learningRate = learningRate; + _softmaxTensorName = softMaxTensorName; + _predictionTensorName = predictionTensorName; + _jpegDataTensorName = jpegDataTensorName; + _resizedImageTensorName = resizeTensorName; + + if (classCount == null) + { + var labelColumn = inputSchema.GetColumnOrNull(labelColumnName).Value; + var labelType = labelColumn.Type; + var labelCount = labelType.GetKeyCount(); + if (labelCount <= 0) + throw Host.ExceptSchemaMismatch(nameof(inputSchema), "label", (string)labelColumn.Name, "Key", (string)labelType.ToString()); + + _classCount = labelCount == 1 ? 
2 : (int)labelCount; + } + else + _classCount = classCount.Value; + + _checkpointPath = Path.Combine(Directory.GetCurrentDirectory(), finalModelPrefix + modelLocation); + + // Configure bottleneck tensor based on the model. + if (arch == ImageClassificationEstimator.Architecture.ResnetV2101) + { + _bottleneckOperationName = "resnet_v2_101/SpatialSqueeze"; + _inputTensorName = "input"; + } + else if (arch == ImageClassificationEstimator.Architecture.InceptionV3) + { + _bottleneckOperationName = "module_apply_default/hub_output/feature_vector/SpatialSqueeze"; + _inputTensorName = "Placeholder"; + } + + _outputs = new[] { scoreColumnName, predictedLabelColumnName }; + + if (loadModel == false) + { + (_jpegData, _resizedImage) = AddJpegDecoding(299, 299, 3); + _jpegDataTensorName = _jpegData.name; + _resizedImageTensorName = _resizedImage.name; + + // Add transfer learning layer. + AddTransferLearningLayer(labelColumnName, scoreColumnName, learningRate, _classCount); + + // Initialize the variables. + new Runner(_session).AddOperation(tf.global_variables_initializer()).Run(); + + // Add evaluation layer. 
+ (_evaluationStep, _) = AddEvaluationStep(_softMaxTensor, _labelTensor); + _softmaxTensorName = _softMaxTensor.name; + _predictionTensorName = _prediction.name; + } + } + + private protected override IRowMapper MakeRowMapper(DataViewSchema inputSchema) => new Mapper(this, inputSchema); + + private protected override void SaveModel(ModelSaveContext ctx) + { + Host.AssertValue(ctx); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + ctx.Writer.WriteBoolByte(_addBatchDimensionInput); + + Host.AssertNonEmpty(_inputs); + ctx.Writer.Write(_inputs.Length); + foreach (var colName in _inputs) + ctx.SaveNonEmptyString(colName); + + Host.AssertNonEmpty(_outputs); + ctx.Writer.Write(_outputs.Length); + foreach (var colName in _outputs) + ctx.SaveNonEmptyString(colName); + + ctx.Writer.Write(_labelColumnName); + ctx.Writer.Write(_finalModelPrefix); + ctx.Writer.Write((int)_arch); + ctx.Writer.Write(_scoreColumnName); + ctx.Writer.Write(_predictedLabelColumnName); + ctx.Writer.Write(_learningRate); + ctx.Writer.Write(_classCount); + ctx.Writer.Write(_predictionTensorName); + ctx.Writer.Write(_softmaxTensorName); + ctx.Writer.Write(_jpegDataTensorName); + ctx.Writer.Write(_resizedImageTensorName); + Status status = new Status(); + var buffer = _session.graph.ToGraphDef(status); + ctx.SaveBinaryStream("TFModel", w => + { + w.WriteByteArray(buffer.Data); + }); + status.Check(true); + } + + ~ImageClassificationTransformer() + { + Dispose(false); + } + + private void Dispose(bool disposing) + { + // Ensure that the Session is not null and it's handle is not Zero, as it may have already been disposed/finalized. + // Technically we shouldn't be calling this if disposing == false, since we're running in finalizer + // and the GC doesn't guarantee ordering of finalization of managed objects, but we have to make sure + // that the Session is closed before deleting our temporary directory. 
+ if (_session != null && _session != IntPtr.Zero) + { + _session.close(); + } + } + + private sealed class Mapper : MapperBase + { + private readonly ImageClassificationTransformer _parent; + private readonly int[] _inputColIndices; + + public Mapper(ImageClassificationTransformer parent, DataViewSchema inputSchema) : + base(Contracts.CheckRef(parent, nameof(parent)).Host.Register(nameof(Mapper)), inputSchema, parent) + { + Host.CheckValue(parent, nameof(parent)); + _parent = parent; + _inputColIndices = new int[1]; + if (!inputSchema.TryGetColumnIndex(_parent._inputs[0], out _inputColIndices[0])) + throw Host.ExceptSchemaMismatch(nameof(InputSchema), "source", _parent._inputs[0]); + } + + private protected override void SaveModel(ModelSaveContext ctx) => _parent.SaveModel(ctx); + + private class OutputCache + { + public long Position; + private ValueGetter> _imagePathGetter; + private ReadOnlyMemory _imagePath; + private Runner _runner; + private ImageProcessor _imageProcessor; + public UInt32 PredictedLabel { get; set; } + public float[] ClassProbabilities { get; set; } + private DataViewRow _inputRow; + + public OutputCache(DataViewRow input, ImageClassificationTransformer transformer) + { + _imagePath = default; + _imagePathGetter = input.GetGetter>(input.Schema[transformer._inputs[0]]); + _runner = new Runner(transformer._session); + _runner.AddInput(transformer._inputTensorName); + _runner.AddOutputs(transformer._softmaxTensorName); + _runner.AddOutputs(transformer._predictionTensorName); + _imageProcessor = new ImageProcessor(transformer); + _inputRow = input; + Position = -1; + } + + public void UpdateCacheIfNeeded() + { + lock (this) + { + if (_inputRow.Position != Position) + { + Position = _inputRow.Position; + _imagePathGetter(ref _imagePath); + var processedTensor = _imageProcessor.ProcessImage(_imagePath.ToString()); + var outputTensor = _runner.AddInput(processedTensor, 0).Run(); + ClassProbabilities = outputTensor[0].Data(); + PredictedLabel = 
(UInt32)outputTensor[1].Data()[0]; + outputTensor[0].Dispose(); + outputTensor[1].Dispose(); + processedTensor.Dispose(); + } + } + } + } + + protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func activeOutput, out Action disposer) + { + disposer = null; + Host.AssertValue(input); + var cache = new OutputCache(input, _parent); + + if (iinfo == 0) + { + ValueGetter> valuegetter = (ref VBuffer dst) => + { + cache.UpdateCacheIfNeeded(); + var editor = VBufferEditor.Create(ref dst, cache.ClassProbabilities.Length); + new Span(cache.ClassProbabilities, 0, cache.ClassProbabilities.Length).CopyTo(editor.Values); + dst = editor.Commit(); + }; + return valuegetter; + } + else + { + ValueGetter valuegetter = (ref UInt32 dst) => + { + cache.UpdateCacheIfNeeded(); + dst = cache.PredictedLabel; + }; + + return valuegetter; + } + } + + private protected override Func GetDependenciesCore(Func activeOutput) + { + return col => Enumerable.Range(0, _parent._outputs.Length).Any(i => activeOutput(i)) && _inputColIndices.Any(i => i == col); + } + + protected override DataViewSchema.DetachedColumn[] GetOutputColumnsCore() + { + var info = new DataViewSchema.DetachedColumn[_parent._outputs.Length]; + info[0] = new DataViewSchema.DetachedColumn(_parent._outputs[0], new VectorDataViewType(NumberDataViewType.Single, _parent._classCount), null); + info[1] = new DataViewSchema.DetachedColumn(_parent._outputs[1], NumberDataViewType.UInt32, null); + return info; + } + } + } + + /// + public sealed class ImageClassificationEstimator : IEstimator + { + /// + /// Image classification model. + /// + public enum Architecture + { + ResnetV2101, + InceptionV3 + }; + + /// + /// Backend DNN training framework. + /// + public enum DnnFramework + { + Tensorflow + }; + + /// + /// Callback that returns DNN statistics during training phase. + /// + public delegate void ImageClassificationMetricsCallback(ImageClassificationMetrics metrics); + + /// + /// DNN training metrics. 
+ /// + public sealed class TrainMetrics + { + /// + /// Indicates the dataset on which metrics are being reported. + /// + /// + public ImageClassificationMetrics.Dataset DatasetUsed { get; set; } + + /// + /// The number of batches processed in an epoch. + /// + public int BatchProcessedCount { get; set; } + + /// + /// The training epoch index for which this metric is reported. + /// + public int Epoch { get; set; } + + /// + /// Accuracy of the batch on this . Higher the better. + /// + public float Accuracy { get; set; } + + /// + /// Cross-Entropy (loss) of the batch on this . Lower + /// the better. + /// + public float CrossEntropy { get; set; } + + /// + /// String representation of the metrics. + /// + public override string ToString() + { + if (DatasetUsed == ImageClassificationMetrics.Dataset.Train) + return $"Phase: Training, Dataset used: {DatasetUsed.ToString(),10}, Batch Processed Count: {BatchProcessedCount,3}, " + + $"Epoch: {Epoch,3}, Accuracy: {Accuracy,10}, Cross-Entropy: {CrossEntropy,10}"; + else + return $"Phase: Training, Dataset used: {DatasetUsed.ToString(),10}, Batch Processed Count: {BatchProcessedCount,3}, " + + $"Epoch: {Epoch,3}, Accuracy: {Accuracy,10}"; + } + } + + /// + /// Metrics for image featurization values. The input image is passed through + /// the network and features are extracted from second or last layer to + /// train a custom full connected layer that serves as classifier. + /// + public sealed class BottleneckMetrics + { + /// + /// Indicates the dataset on which metrics are being reported. + /// + /// + public ImageClassificationMetrics.Dataset DatasetUsed { get; set; } + + /// + /// Name of the input image. + /// + public string Name { get; set; } + + /// + /// Index of the input image. + /// + public int Index { get; set; } + + /// + /// String representation of the metrics. 
+ /// + public override string ToString() => $"Phase: Bottleneck Computation, Dataset used: {DatasetUsed.ToString(),10}, Image Index: {Index,3}, Image Name: {Name}"; + } + + /// + /// Metrics for image classification training. + /// + public sealed class ImageClassificationMetrics + { + /// + /// Indicates the kind of the dataset of which metric is reported. + /// + public enum Dataset + { + Train, + Validation + }; + + /// + /// Contains train time metrics. + /// + public TrainMetrics Train { get; set; } + + /// + /// Contains pre-train time metrics. These contains metrics on image + /// featurization. + /// + public BottleneckMetrics Bottleneck { get; set; } + + /// + /// String representation of the metrics. + /// + public override string ToString() => Train != null ? Train.ToString() : Bottleneck.ToString(); + } + + /// + /// The options for the . + /// + internal sealed class Options : TransformInputBase + { + /// + /// Location of the TensorFlow model. + /// + [Argument(ArgumentType.Required, HelpText = "TensorFlow model used by the transform. Please see https://www.tensorflow.org/mobile/prepare_models for more details.", SortOrder = 0)] + public string ModelLocation; + + /// + /// The names of the model inputs. + /// + [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "The names of the model inputs", ShortName = "inputs", SortOrder = 1)] + public string[] InputColumns; + + /// + /// The names of the requested model outputs. + /// + [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "The name of the outputs", ShortName = "outputs", SortOrder = 2)] + public string[] OutputColumns; + + /// + /// The name of the label column in that will be mapped to label node in TensorFlow model. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Training labels.", ShortName = "label", SortOrder = 4)] + public string LabelColumn; + + /// + /// The name of the label in TensorFlow model. 
+ /// + [Argument(ArgumentType.AtMostOnce, HelpText = "TensorFlow label node.", ShortName = "TFLabel", SortOrder = 5)] + public string TensorFlowLabel; + + /// + /// Number of samples to use for mini-batch training. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Number of samples to use for mini-batch training.", SortOrder = 9)] + public int BatchSize = 64; + + /// + /// Number of training iterations. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Number of training iterations.", SortOrder = 10)] + public int Epoch = 5; + + /// + /// Learning rate to use during optimization. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Learning rate to use during optimization.", SortOrder = 12)] + public float LearningRate = 0.01f; + + /// + /// Specifies the model architecture to be used in the case of image classification training using transfer learning. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Model architecture to be used in transfer learning for image classification.", SortOrder = 15)] + public Architecture Arch = Architecture.InceptionV3; + + /// + /// Name of the tensor that will contain the output scores of the last layer when transfer learning is done. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Softmax tensor of the last layer in transfer learning.", SortOrder = 15)] + public string ScoreColumnName = "Scores"; + + /// + /// Name of the tensor that will contain the predicted label from output scores of the last layer when transfer learning is done. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Argmax tensor of the last layer in transfer learning.", SortOrder = 15)] + public string PredictedLabelColumnName = "PredictedLabel"; + + /// + /// Final model and checkpoint files/folder prefix for storing graph files. 
+ /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Final model and checkpoint files/folder prefix for storing graph files.", SortOrder = 15)] + public string FinalModelPrefix = "custom_retrained_model_based_on_"; + + /// + /// Callback to report statistics on accuracy/cross entropy during training phase. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Callback to report metrics during training and validation phase.", SortOrder = 15)] + public ImageClassificationMetricsCallback MetricsCallback = null; + + /// + /// Frequency of epochs at which statistics on training phase should be reported. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Frequency of epochs at which statistics on training/validation phase should be reported.", SortOrder = 15)] + public int StatisticsFrequency = 1; + + /// + /// Indicates the choice DNN training framework. Currently only TensorFlow is supported. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Indicates the choice DNN training framework. Currently only TensorFlow is supported.", SortOrder = 15)] + public DnnFramework Framework = DnnFramework.Tensorflow; + + /// + /// Indicates the path where the newly retrained model should be saved. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Indicates the path where the newly retrained model should be saved.", SortOrder = 15)] + public string ModelSavePath = null; + + /// + /// Indicates to evaluate the model on train set after every epoch. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Indicates to evaluate the model on train set after every epoch.", SortOrder = 15)] + public bool TestOnTrainSet; + + /// + /// Indicates to not re-compute cached bottleneck trainset values if already available in the bin folder. 
+ /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Indicates to not re-compute trained cached bottleneck values if already available in the bin folder.", SortOrder = 15)] + public bool ReuseTrainSetBottleneckCachedValues; + + /// + /// Indicates to not re-compute cached bottleneck validationset values if already available in the bin folder. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Indicates to not re-compute validataionset cached bottleneck validationset values if already available in the bin folder.", SortOrder = 15)] + public bool ReuseValidationSetBottleneckCachedValues; + + /// + /// Validation set. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Validation set.", SortOrder = 15)] + public IDataView ValidationSet; + + /// + /// Indicates the file path to store trainset bottleneck values for caching. + /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Indicates the file path to store trainset bottleneck values for caching.", SortOrder = 15)] + public string TrainSetBottleneckCachedValuesFilePath; + + /// + /// Indicates the file path to store validationset bottleneck values for caching. 
+ /// + [Argument(ArgumentType.AtMostOnce, HelpText = "Indicates the file path to store validationset bottleneck values for caching.", SortOrder = 15)] + public string ValidationSetBottleneckCachedValuesFilePath; + } + + private readonly IHost _host; + private readonly Options _options; + private readonly DnnModel _dnnModel; + private readonly TF_DataType[] _tfInputTypes; + private readonly DataViewType[] _outputTypes; + private ImageClassificationTransformer _transformer; + + internal ImageClassificationEstimator(IHostEnvironment env, Options options, DnnModel dnnModel) + { + _host = Contracts.CheckRef(env, nameof(env)).Register(nameof(ImageClassificationEstimator)); + _options = options; + _dnnModel = dnnModel; + _tfInputTypes = new[] { TF_DataType.TF_STRING }; + _outputTypes = new[] { new VectorDataViewType(NumberDataViewType.Single), NumberDataViewType.UInt32.GetItemType() }; + } + + private static Options CreateArguments(DnnModel tensorFlowModel, string[] outputColumnNames, string[] inputColumnName, bool addBatchDimensionInput) + { + var options = new Options(); + options.ModelLocation = tensorFlowModel.ModelPath; + options.InputColumns = inputColumnName; + options.OutputColumns = outputColumnNames; + return options; + } + + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. 
+ /// + public SchemaShape GetOutputSchema(SchemaShape inputSchema) + { + _host.CheckValue(inputSchema, nameof(inputSchema)); + var result = inputSchema.ToDictionary(x => x.Name); + var resultDic = inputSchema.ToDictionary(x => x.Name); + for (var i = 0; i < _options.InputColumns.Length; i++) + { + var input = _options.InputColumns[i]; + if (!inputSchema.TryFindColumn(input, out var col)) + throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", input); + var expectedType = DnnUtils.Tf2MlNetType(_tfInputTypes[i]); + if (col.ItemType != expectedType) + throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", input, expectedType.ToString(), col.ItemType.ToString()); + } + for (var i = 0; i < _options.OutputColumns.Length; i++) + { + resultDic[_options.OutputColumns[i]] = new SchemaShape.Column(_options.OutputColumns[i], + _outputTypes[i].IsKnownSizeVector() ? SchemaShape.Column.VectorKind.Vector + : SchemaShape.Column.VectorKind.VariableVector, _outputTypes[i].GetItemType(), false); + } + return new SchemaShape(resultDic.Values); + } + + /// + /// Trains and returns a . + /// + public ImageClassificationTransformer Fit(IDataView input) + { + _host.CheckValue(input, nameof(input)); + if (_transformer == null) + _transformer = new ImageClassificationTransformer(_host, _options, _dnnModel, input); + + // Validate input schema. 
+ _transformer.GetOutputSchema(input.Schema); + return _transformer; + } + } +} diff --git a/src/Microsoft.ML.FastTree/Properties/AssemblyInfo.cs b/src/Microsoft.ML.FastTree/Properties/AssemblyInfo.cs index 52ebe6ccb7..d74ae8f563 100644 --- a/src/Microsoft.ML.FastTree/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.FastTree/Properties/AssemblyInfo.cs @@ -14,5 +14,6 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Internal.FastTree" + InternalPublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "RunTests" + InternalPublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.AutoML" + PublicKey.Value)] [assembly: WantsToBeBestFriends] diff --git a/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs b/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs index 7a8d7063dd..e1b8ef452e 100644 --- a/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs +++ b/src/Microsoft.ML.TensorFlow/TensorflowTransform.cs @@ -639,7 +639,10 @@ private void UpdateCacheIfNeeded(long position, ITensorValueGetter[] srcTensorGe // Feed inputs to the graph. for (int i = 0; i < _parent.Inputs.Length; i++) - runner.AddInput(_parent.Inputs[i], srcTensorGetters[i].GetTensor()); + { + var tensor = srcTensorGetters[i].GetTensor(); + runner.AddInput(_parent.Inputs[i], tensor); + } // Add outputs. 
for (int i = 0; i < _parent.Outputs.Length; i++) @@ -651,8 +654,12 @@ private void UpdateCacheIfNeeded(long position, ITensorValueGetter[] srcTensorGe Contracts.Assert(tensors.Length > 0); for (int j = 0; j < activeOutputColNames.Length; j++) - outputCache.Outputs[activeOutputColNames[j]] = tensors[j]; + { + if (outputCache.Outputs.TryGetValue(activeOutputColNames[j], out Tensor outTensor)) + outTensor.Dispose(); + outputCache.Outputs[activeOutputColNames[j]] = tensors[j]; + } outputCache.Position = position; } } @@ -704,7 +711,6 @@ private class TensorValueGetter : ITensorValueGetter private readonly T[] _bufferedData; private readonly TensorShape _tfShape; private int _position; - private readonly List _tensors; public TensorValueGetter(DataViewRow input, int colIndex, TensorShape tfShape) { @@ -719,7 +725,6 @@ public TensorValueGetter(DataViewRow input, int colIndex, TensorShape tfShape) size *= dim; } _bufferedData = new T[size]; - _tensors = new List(); } public Tensor GetTensor() @@ -728,7 +733,6 @@ public Tensor GetTensor() _srcgetter(ref scalar); var tensor = new Tensor(new[] { scalar }); tensor.SetShape(_tfShape); - _tensors.Add(tensor); return tensor; } @@ -743,7 +747,6 @@ public Tensor GetBufferedBatchTensor() { var tensor = new Tensor(new NDArray(_bufferedData, _tfShape)); _position = 0; - _tensors.Add(tensor); return tensor; } } @@ -757,7 +760,6 @@ private class TensorValueGetterVec : ITensorValueGetter private T[] _bufferedData; private int _position; private long[] _dims; - private readonly List _tensors; private readonly long _bufferedDataSize; public TensorValueGetterVec(DataViewRow input, int colIndex, TensorShape tfShape) @@ -778,7 +780,6 @@ public TensorValueGetterVec(DataViewRow input, int colIndex, TensorShape tfShape _bufferedData = new T[size]; if (_tfShape.Dimensions != null) _dims = _tfShape.Dimensions.Select(x => (long)x).ToArray(); - _tensors = new List(); _bufferedDataSize = size; } @@ -792,7 +793,6 @@ public Tensor GetTensor() 
_denseData = new T[_vBuffer.Length]; _vBuffer.CopyTo(_denseData); var tensor = CastDataAndReturnAsTensor(_denseData); - _tensors.Add(tensor); return tensor; } @@ -845,7 +845,6 @@ public Tensor GetBufferedBatchTensor() { _position = 0; var tensor = CastDataAndReturnAsTensor(_bufferedData); - _tensors.Add(tensor); _bufferedData = new T[_bufferedDataSize]; return tensor; diff --git a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs new file mode 100644 index 0000000000..bf4e260335 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs @@ -0,0 +1,64 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Data; +using Xunit; +using System.Linq; + +namespace Microsoft.ML.AutoML.Test +{ + + public class AutoFitTests + { + [Fact] + public void AutoFitBinaryTest() + { + var context = new MLContext(); + var dataPath = DatasetUtil.DownloadUciAdultDataset(); + var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel); + var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions); + var trainData = textLoader.Load(dataPath); + var result = context.Auto() + .CreateBinaryClassificationExperiment(0) + .Execute(trainData, new ColumnInformation() { LabelColumnName = DatasetUtil.UciAdultLabel }); + Assert.True(result.BestRun.ValidationMetrics.Accuracy > 0.70); + Assert.NotNull(result.BestRun.Estimator); + Assert.NotNull(result.BestRun.Model); + Assert.NotNull(result.BestRun.TrainerName); + } + + [Fact] + public void AutoFitMultiTest() + { + var context = new MLContext(); + var columnInference = context.Auto().InferColumns(DatasetUtil.TrivialMulticlassDatasetPath, DatasetUtil.TrivialMulticlassDatasetLabel); + var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions); + var 
trainData = textLoader.Load(DatasetUtil.TrivialMulticlassDatasetPath); + var result = context.Auto() + .CreateMulticlassClassificationExperiment(0) + .Execute(trainData, 5, DatasetUtil.TrivialMulticlassDatasetLabel); + Assert.True(result.BestRun.Results.First().ValidationMetrics.MicroAccuracy >= 0.7); + var scoredData = result.BestRun.Results.First().Model.Transform(trainData); + Assert.Equal(NumberDataViewType.Single, scoredData.Schema[DefaultColumnNames.PredictedLabel].Type); + } + + [Fact] + public void AutoFitRegressionTest() + { + var context = new MLContext(); + var dataPath = DatasetUtil.DownloadMlNetGeneratedRegressionDataset(); + var columnInference = context.Auto().InferColumns(dataPath, DatasetUtil.MlNetGeneratedRegressionLabel); + var textLoader = context.Data.CreateTextLoader(columnInference.TextLoaderOptions); + var trainData = textLoader.Load(dataPath); + var validationData = context.Data.TakeRows(trainData, 20); + trainData = context.Data.SkipRows(trainData, 20); + var result = context.Auto() + .CreateRegressionExperiment(0) + .Execute(trainData, validationData, + new ColumnInformation() { LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel }); + + Assert.True(result.RunDetails.Max(i => i.ValidationMetrics.RSquared > 0.9)); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/BestResultUtilTests.cs b/test/Microsoft.ML.AutoML.Tests/BestResultUtilTests.cs new file mode 100644 index 0000000000..8765bffc13 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/BestResultUtilTests.cs @@ -0,0 +1,63 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Collections.Generic; +using Microsoft.ML.Data; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class BestResultUtilTests + { + [Fact] + public void FindBestResultWithSomeNullMetrics() + { + var metrics1 = MetricsUtil.CreateRegressionMetrics(0.2, 0.2, 0.2, 0.2, 0.2); + var metrics2 = MetricsUtil.CreateRegressionMetrics(0.3, 0.3, 0.3, 0.3, 0.3); + var metrics3 = MetricsUtil.CreateRegressionMetrics(0.1, 0.1, 0.1, 0.1, 0.1); + + var runResults = new List>() + { + new RunDetail(null, null, null, null, null, null), + new RunDetail(null, null, null, null, metrics1, null), + new RunDetail(null, null, null, null, metrics2, null), + new RunDetail(null, null, null, null, metrics3, null), + }; + + var metricsAgent = new RegressionMetricsAgent(null, RegressionMetric.RSquared); + var bestResult = BestResultUtil.GetBestRun(runResults, metricsAgent, true); + Assert.Equal(0.3, bestResult.ValidationMetrics.RSquared); + } + + [Fact] + public void FindBestResultWithAllNullMetrics() + { + var runResults = new List>() + { + new RunDetail(null, null, null, null, null, null), + }; + + var metricsAgent = new RegressionMetricsAgent(null, RegressionMetric.RSquared); + var bestResult = BestResultUtil.GetBestRun(runResults, metricsAgent, true); + Assert.Null(bestResult); + } + + [Fact] + public void GetIndexOfBestScoreMaximizingUtil() + { + var scores = new double[] { 0, 2, 5, 100, -100, -70 }; + var indexOfMaxScore = BestResultUtil.GetIndexOfBestScore(scores, true); + Assert.Equal(3, indexOfMaxScore); + } + + [Fact] + public void GetIndexOfBestScoreMinimizingUtil() + { + var scores = new double[] { 0, 2, 5, 100, -100, -70 }; + var indexOfMaxScore = BestResultUtil.GetIndexOfBestScore(scores, false); + Assert.Equal(4, indexOfMaxScore); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs new file mode 100644 index 0000000000..5545615b9d --- /dev/null +++ 
b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceTests.cs @@ -0,0 +1,150 @@ +using System; +using System.IO; +using System.Linq; +using Microsoft.ML.Data; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class ColumnInferenceTests + { + [Fact] + public void UnGroupReturnsMoreColumnsThanGroup() + { + var dataPath = DatasetUtil.DownloadUciAdultDataset(); + var context = new MLContext(); + var columnInferenceWithoutGrouping = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel, groupColumns: false); + foreach (var col in columnInferenceWithoutGrouping.TextLoaderOptions.Columns) + { + Assert.False(col.Source.Length > 1 || col.Source[0].Min != col.Source[0].Max); + } + + var columnInferenceWithGrouping = context.Auto().InferColumns(dataPath, DatasetUtil.UciAdultLabel, groupColumns: true); + Assert.True(columnInferenceWithGrouping.TextLoaderOptions.Columns.Count() < columnInferenceWithoutGrouping.TextLoaderOptions.Columns.Count()); + } + + [Fact] + public void IncorrectLabelColumnThrows() + { + var dataPath = DatasetUtil.DownloadUciAdultDataset(); + var context = new MLContext(); + Assert.Throws(new System.Action(() => context.Auto().InferColumns(dataPath, "Junk", groupColumns: false))); + } + + [Fact] + public void LabelIndexOutOfBoundsThrows() + { + Assert.Throws(() => new MLContext().Auto().InferColumns(DatasetUtil.DownloadUciAdultDataset(), 100)); + } + + [Fact] + public void IdentifyLabelColumnThroughIndexWithHeader() + { + var result = new MLContext().Auto().InferColumns(DatasetUtil.DownloadUciAdultDataset(), 14, hasHeader: true); + Assert.True(result.TextLoaderOptions.HasHeader); + var labelCol = result.TextLoaderOptions.Columns.First(c => c.Source[0].Min == 14 && c.Source[0].Max == 14); + Assert.Equal("hours-per-week", labelCol.Name); + Assert.Equal("hours-per-week", result.ColumnInformation.LabelColumnName); + } + + [Fact] + public void IdentifyLabelColumnThroughIndexWithoutHeader() + { + var result = new 
MLContext().Auto().InferColumns(DatasetUtil.DownloadIrisDataset(), DatasetUtil.IrisDatasetLabelColIndex); + Assert.False(result.TextLoaderOptions.HasHeader); + var labelCol = result.TextLoaderOptions.Columns.First(c => c.Source[0].Min == DatasetUtil.IrisDatasetLabelColIndex && + c.Source[0].Max == DatasetUtil.IrisDatasetLabelColIndex); + Assert.Equal(DefaultColumnNames.Label, labelCol.Name); + Assert.Equal(DefaultColumnNames.Label, result.ColumnInformation.LabelColumnName); + } + + [Fact] + public void DatasetWithEmptyColumn() + { + var result = new MLContext().Auto().InferColumns(Path.Combine("TestData", "DatasetWithEmptyColumn.txt"), DefaultColumnNames.Label, groupColumns: false); + var emptyColumn = result.TextLoaderOptions.Columns.First(c => c.Name == "Empty"); + Assert.Equal(DataKind.Single, emptyColumn.DataKind); + } + + [Fact] + public void DatasetWithBoolColumn() + { + var result = new MLContext().Auto().InferColumns(Path.Combine("TestData", "BinaryDatasetWithBoolColumn.txt"), DefaultColumnNames.Label); + Assert.Equal(2, result.TextLoaderOptions.Columns.Count()); + + var boolColumn = result.TextLoaderOptions.Columns.First(c => c.Name == "Bool"); + var labelColumn = result.TextLoaderOptions.Columns.First(c => c.Name == DefaultColumnNames.Label); + // ensure non-label Boolean column is detected as R4 + Assert.Equal(DataKind.Single, boolColumn.DataKind); + Assert.Equal(DataKind.Boolean, labelColumn.DataKind); + + // ensure non-label Boolean column is detected as R4 + Assert.Single(result.ColumnInformation.NumericColumnNames); + Assert.Equal("Bool", result.ColumnInformation.NumericColumnNames.First()); + Assert.Equal(DefaultColumnNames.Label, result.ColumnInformation.LabelColumnName); + } + + [Fact] + public void WhereNameColumnIsOnlyFeature() + { + var result = new MLContext().Auto().InferColumns(Path.Combine("TestData", "NameColumnIsOnlyFeatureDataset.txt"), DefaultColumnNames.Label); + Assert.Equal(2, result.TextLoaderOptions.Columns.Count()); + + var 
nameColumn = result.TextLoaderOptions.Columns.First(c => c.Name == "Username"); + var labelColumn = result.TextLoaderOptions.Columns.First(c => c.Name == DefaultColumnNames.Label); + Assert.Equal(DataKind.String, nameColumn.DataKind); + Assert.Equal(DataKind.Boolean, labelColumn.DataKind); + + Assert.Single(result.ColumnInformation.TextColumnNames); + Assert.Equal("Username", result.ColumnInformation.TextColumnNames.First()); + Assert.Equal(DefaultColumnNames.Label, result.ColumnInformation.LabelColumnName); + } + + [Fact] + public void DefaultColumnNamesInferredCorrectly() + { + var result = new MLContext().Auto().InferColumns(Path.Combine("TestData", "DatasetWithDefaultColumnNames.txt"), + new ColumnInformation() + { + LabelColumnName = DefaultColumnNames.Label, + ExampleWeightColumnName = DefaultColumnNames.Weight, + }, + groupColumns : false); + + Assert.Equal(DefaultColumnNames.Label, result.ColumnInformation.LabelColumnName); + Assert.Equal(DefaultColumnNames.Weight, result.ColumnInformation.ExampleWeightColumnName); + Assert.Equal(3, result.ColumnInformation.NumericColumnNames.Count()); + } + + [Fact] + public void DefaultColumnNamesNoGrouping() + { + var result = new MLContext().Auto().InferColumns(Path.Combine("TestData", "DatasetWithDefaultColumnNames.txt"), + new ColumnInformation() + { + LabelColumnName = DefaultColumnNames.Label, + ExampleWeightColumnName = DefaultColumnNames.Weight, + }); + + Assert.Equal(DefaultColumnNames.Label, result.ColumnInformation.LabelColumnName); + Assert.Equal(DefaultColumnNames.Weight, result.ColumnInformation.ExampleWeightColumnName); + Assert.Single(result.ColumnInformation.NumericColumnNames); + Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First()); + } + + [Fact] + public void InferColumnsColumnInfoParam() + { + var columnInfo = new ColumnInformation() { LabelColumnName = DatasetUtil.MlNetGeneratedRegressionLabel }; + var result = new 
MLContext().Auto().InferColumns(DatasetUtil.DownloadMlNetGeneratedRegressionDataset(), + columnInfo); + var labelCol = result.TextLoaderOptions.Columns.First(c => c.Name == DatasetUtil.MlNetGeneratedRegressionLabel); + Assert.Equal(DataKind.Single, labelCol.DataKind); + Assert.Equal(DatasetUtil.MlNetGeneratedRegressionLabel, result.ColumnInformation.LabelColumnName); + Assert.Single(result.ColumnInformation.NumericColumnNames); + Assert.Equal(DefaultColumnNames.Features, result.ColumnInformation.NumericColumnNames.First()); + Assert.Null(result.ColumnInformation.ExampleWeightColumnName); + } + } +} \ No newline at end of file diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInferenceValidationUtilTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceValidationUtilTests.cs new file mode 100644 index 0000000000..b990c4d3d9 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/ColumnInferenceValidationUtilTests.cs @@ -0,0 +1,28 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.IO; +using Microsoft.ML.Data; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class ColumnInferenceValidationUtilTests + { + [Fact] + public void ValidateColumnNotContainedInData() + { + var schemaBuilder = new DataViewSchema.Builder(); + schemaBuilder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Single); + schemaBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single); + var schema = schemaBuilder.ToSchema(); + var dataView = DataViewTestFixture.BuildDummyDataView(schema); + var columnInfo = new ColumnInformation(); + columnInfo.CategoricalColumnNames.Add("Categorical"); + Assert.Throws(() => ColumnInferenceValidationUtil.ValidateSpecifiedColumnsExist(columnInfo, dataView)); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/ColumnInformationUtilTests.cs b/test/Microsoft.ML.AutoML.Tests/ColumnInformationUtilTests.cs new file mode 100644 index 0000000000..a181da972e --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/ColumnInformationUtilTests.cs @@ -0,0 +1,57 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Linq; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class ColumnInformationUtilTests + { + [Fact] + public void GetColumnPurpose() + { + var columnInfo = new ColumnInformation() + { + LabelColumnName = "Label", + ExampleWeightColumnName = "Weight", + SamplingKeyColumnName = "SamplingKey", + }; + columnInfo.CategoricalColumnNames.Add("Cat"); + columnInfo.NumericColumnNames.Add("Num"); + columnInfo.TextColumnNames.Add("Text"); + columnInfo.IgnoredColumnNames.Add("Ignored"); + + Assert.Equal(ColumnPurpose.Label, ColumnInformationUtil.GetColumnPurpose(columnInfo, "Label")); + Assert.Equal(ColumnPurpose.Weight, ColumnInformationUtil.GetColumnPurpose(columnInfo, "Weight")); + Assert.Equal(ColumnPurpose.SamplingKey, ColumnInformationUtil.GetColumnPurpose(columnInfo, "SamplingKey")); + Assert.Equal(ColumnPurpose.CategoricalFeature, ColumnInformationUtil.GetColumnPurpose(columnInfo, "Cat")); + Assert.Equal(ColumnPurpose.NumericFeature, ColumnInformationUtil.GetColumnPurpose(columnInfo, "Num")); + Assert.Equal(ColumnPurpose.TextFeature, ColumnInformationUtil.GetColumnPurpose(columnInfo, "Text")); + Assert.Equal(ColumnPurpose.Ignore, ColumnInformationUtil.GetColumnPurpose(columnInfo, "Ignored")); + Assert.Null(ColumnInformationUtil.GetColumnPurpose(columnInfo, "NonExistent")); + } + + [Fact] + public void GetColumnNamesTest() + { + var columnInfo = new ColumnInformation() + { + LabelColumnName = "Label", + SamplingKeyColumnName = "SamplingKey", + }; + columnInfo.CategoricalColumnNames.Add("Cat1"); + columnInfo.CategoricalColumnNames.Add("Cat2"); + columnInfo.NumericColumnNames.Add("Num"); + var columnNames = ColumnInformationUtil.GetColumnNames(columnInfo); + Assert.Equal(5, columnNames.Count()); + Assert.Contains("Label", columnNames); + Assert.Contains("SamplingKey", columnNames); + Assert.Contains("Cat1", columnNames); + Assert.Contains("Cat2", columnNames); + Assert.Contains("Num", columnNames); + } + } +} \ No newline at end of file 
diff --git a/test/Microsoft.ML.AutoML.Tests/ConversionTests.cs b/test/Microsoft.ML.AutoML.Tests/ConversionTests.cs new file mode 100644 index 0000000000..d35007370a --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/ConversionTests.cs @@ -0,0 +1,95 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Data.Conversion; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.AutoML.Test +{ + + public class ConversionTests + { + private readonly ITestOutputHelper output; + + public ConversionTests(ITestOutputHelper output) + { + this.output = output; + } + + [Fact] + public void ConvertFloatMissingValues() + { + var missingValues = new string[] + { + //"", + "?", " ", + "na", "n/a", "nan", + "NA", "N/A", "NaN", "NAN" + }; + + foreach(var missingValue in missingValues) + { + float value; + var success = Conversions.Instance.TryParse(missingValue.AsMemory(), out value); + output.WriteLine($"{missingValue} parsed as {value}"); + Assert.True(success); + //Assert.Equal(float.NaN, value); + } + } + + [Fact] + public void ConvertFloatParseFailure() + { + var values = new string[] + { + "a", "aa", "nb", "aaa", "naa", "nba", "n/b" + }; + + foreach (var value in values) + { + var success = Conversions.Instance.TryParse(value.AsMemory(), out float _); + Assert.False(success); + } + } + + [Fact] + public void ConvertBoolMissingValues() + { + var missingValues = new string[] + { + "", + "no", "NO", "+1", "-1", + "yes", "YES", + "true", "TRUE", + "false", "FALSE" + }; + + foreach (var missingValue in missingValues) + { + var success = Conversions.Instance.TryParse(missingValue.AsMemory(), out bool _); + Assert.True(success); + } + } + + [Fact] + public void ConvertBoolParseFailure() + { + var values = new string[] + { + "aa", "na", "+a", "-a", + "aaa", "yaa", "yea", + "aaaa", 
"taaa", "traa", "trua", + "aaaaa", "fbbbb", "faaaa", "falaa", "falsa" + }; + + foreach (var value in values) + { + var success = Conversions.Instance.TryParse(value.AsMemory(), out bool _); + Assert.False(success); + } + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/DatasetDimensionsTests.cs b/test/Microsoft.ML.AutoML.Tests/DatasetDimensionsTests.cs new file mode 100644 index 0000000000..cf9ec496da --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/DatasetDimensionsTests.cs @@ -0,0 +1,86 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Data; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class DatasetDimensionsTests + { + public object DatasetDimensionUtil { get; private set; } + + [Fact] + public void TextColumnDimensionsTest() + { + var context = new MLContext(); + var dataBuilder = new ArrayDataViewBuilder(context); + dataBuilder.AddColumn("categorical", new string[] { "0", "1", "0", "1", "0", "1", "2", "2", "0", "1" }); + dataBuilder.AddColumn("text", new string[] { "0", "1", "0", "1", "0", "1", "2", "2", "0", "1" }); + var data = dataBuilder.GetDataView(); + var dimensions = DatasetDimensionsApi.CalcColumnDimensions(context, data, new[] { + new PurposeInference.Column(0, ColumnPurpose.CategoricalFeature), + new PurposeInference.Column(0, ColumnPurpose.TextFeature), + }); + Assert.NotNull(dimensions); + Assert.Equal(2, dimensions.Length); + Assert.Equal(3, dimensions[0].Cardinality); + Assert.Null(dimensions[1].Cardinality); + Assert.Null(dimensions[0].HasMissing); + Assert.Null(dimensions[1].HasMissing); + } + + [Fact] + public void FloatColumnDimensionsTest() + { + var context = new MLContext(); + var dataBuilder = new ArrayDataViewBuilder(context); + dataBuilder.AddColumn("NoNan", NumberDataViewType.Single, new float[] { 0, 1, 0, 1, 0 }); + 
dataBuilder.AddColumn("Nan", NumberDataViewType.Single, new float[] { 0, 1, 0, 1, float.NaN }); + var data = dataBuilder.GetDataView(); + var dimensions = DatasetDimensionsApi.CalcColumnDimensions(context, data, new[] { + new PurposeInference.Column(0, ColumnPurpose.NumericFeature), + new PurposeInference.Column(1, ColumnPurpose.NumericFeature), + }); + Assert.NotNull(dimensions); + Assert.Equal(2, dimensions.Length); + Assert.Null(dimensions[0].Cardinality); + Assert.Null(dimensions[1].Cardinality); + Assert.False(dimensions[0].HasMissing); + Assert.True(dimensions[1].HasMissing); + } + + [Fact] + public void FloatVectorColumnHasNanTest() + { + var context = new MLContext(); + var dataBuilder = new ArrayDataViewBuilder(context); + var slotNames = new[] { "Col1", "Col2" }; + var colValues = new float[][] + { + new float[] { 0, 0 }, + new float[] { 1, 1 }, + }; + dataBuilder.AddColumn("NoNan", Util.GetKeyValueGetter(slotNames), NumberDataViewType.Single, colValues); + colValues = new float[][] + { + new float[] { 0, 0 }, + new float[] { 1, float.NaN }, + }; + dataBuilder.AddColumn("Nan", Util.GetKeyValueGetter(slotNames), NumberDataViewType.Single, colValues); + var data = dataBuilder.GetDataView(); + var dimensions = DatasetDimensionsApi.CalcColumnDimensions(context, data, new[] { + new PurposeInference.Column(0, ColumnPurpose.NumericFeature), + new PurposeInference.Column(1, ColumnPurpose.NumericFeature), + }); + Assert.NotNull(dimensions); + Assert.Equal(2, dimensions.Length); + Assert.Null(dimensions[0].Cardinality); + Assert.Null(dimensions[1].Cardinality); + Assert.False(dimensions[0].HasMissing); + Assert.True(dimensions[1].HasMissing); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/DatasetUtil.cs b/test/Microsoft.ML.AutoML.Tests/DatasetUtil.cs new file mode 100644 index 0000000000..95d4cccc52 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/DatasetUtil.cs @@ -0,0 +1,86 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.IO; +using System.Linq; +using System.Net; +using System.Threading; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Test +{ + internal static class DatasetUtil + { + public const string UciAdultLabel = DefaultColumnNames.Label; + public const string TrivialMulticlassDatasetLabel = "Target"; + public const string MlNetGeneratedRegressionLabel = "target"; + public const int IrisDatasetLabelColIndex = 0; + + public static string TrivialMulticlassDatasetPath = Path.Combine("TestData", "TrivialMulticlassDataset.txt"); + + private static IDataView _uciAdultDataView; + + public static IDataView GetUciAdultDataView() + { + if(_uciAdultDataView == null) + { + var context = new MLContext(); + var uciAdultDataFile = DownloadUciAdultDataset(); + var columnInferenceResult = context.Auto().InferColumns(uciAdultDataFile, UciAdultLabel); + var textLoader = context.Data.CreateTextLoader(columnInferenceResult.TextLoaderOptions); + _uciAdultDataView = textLoader.Load(uciAdultDataFile); + } + return _uciAdultDataView; + } + + // downloads the UCI Adult dataset from the ML.Net repo + public static string DownloadUciAdultDataset() => + DownloadIfNotExists("https://raw.githubusercontent.com/dotnet/machinelearning/f0e639af5ffdc839aae8e65d19b5a9a1f0db634a/test/data/adult.tiny.with-schema.txt", "uciadult.dataset"); + + public static string DownloadMlNetGeneratedRegressionDataset() => + DownloadIfNotExists("https://raw.githubusercontent.com/dotnet/machinelearning/e78971ea6fd736038b4c355b840e5cbabae8cb55/test/data/generated_regression_dataset.csv", "mlnet_generated_regression.dataset"); + + public static string DownloadIrisDataset() => + DownloadIfNotExists("https://raw.githubusercontent.com/dotnet/machinelearning/54596ac/test/data/iris.txt", "iris.dataset"); + + private static string DownloadIfNotExists(string 
baseGitPath, string dataFile) + { + foreach (var nextIteration in Enumerable.Range(0, 10)) + { + // if file doesn't already exist, download it + if (!File.Exists(dataFile)) + { + var tempFile = Path.GetTempFileName(); + + try + { + using (var client = new WebClient()) + { + client.DownloadFile(new Uri($"{baseGitPath}"), tempFile); + + if (!File.Exists(dataFile)) + { + File.Copy(tempFile, dataFile); + File.Delete(tempFile); + } + } + } + catch(Exception) + { + } + } + + if (File.Exists(dataFile) && (new FileInfo(dataFile).Length > 0)) + { + return dataFile; + } + + Thread.Sleep(300); + } + + throw new Exception($"Failed to download test file {dataFile}."); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/EstimatorExtensionTests.cs b/test/Microsoft.ML.AutoML.Tests/EstimatorExtensionTests.cs new file mode 100644 index 0000000000..fdbe7573d3 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/EstimatorExtensionTests.cs @@ -0,0 +1,53 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Linq; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class EstimatorExtensionTests + { + [Fact] + public void EstimatorExtensionInstanceTests() + { + var context = new MLContext(); + var pipelineNode = new PipelineNode() + { + InColumns = new string[] { "Input" }, + OutColumns = new string[] { "Output" } + }; + + var estimatorNames = Enum.GetValues(typeof(EstimatorName)).Cast(); + foreach (var estimatorName in estimatorNames) + { + var extension = EstimatorExtensionCatalog.GetExtension(estimatorName); + var instance = extension.CreateInstance(context, pipelineNode); + Assert.NotNull(instance); + } + } + + [Fact] + public void EstimatorExtensionStaticTests() + { + var context = new MLContext(); + var inCol = "Input"; + var outCol = "Output"; + var inCols = new string[] { inCol }; + var outCols = new string[] { outCol }; + Assert.NotNull(ColumnConcatenatingExtension.CreateSuggestedTransform(context, inCols, outCol)); + Assert.NotNull(ColumnCopyingExtension.CreateSuggestedTransform(context, inCol, outCol)); + Assert.NotNull(MissingValueIndicatingExtension.CreateSuggestedTransform(context, inCols, outCols)); + Assert.NotNull(MissingValueReplacingExtension.CreateSuggestedTransform(context, inCols, outCols)); + Assert.NotNull(NormalizingExtension.CreateSuggestedTransform(context, inCol, outCol)); + Assert.NotNull(OneHotEncodingExtension.CreateSuggestedTransform(context, inCols, outCols)); + Assert.NotNull(OneHotHashEncodingExtension.CreateSuggestedTransform(context, inCols, outCols)); + Assert.NotNull(TextFeaturizingExtension.CreateSuggestedTransform(context, inCol, outCol)); + Assert.NotNull(TypeConvertingExtension.CreateSuggestedTransform(context, inCols, outCols)); + Assert.NotNull(ValueToKeyMappingExtension.CreateSuggestedTransform(context, inCol, outCol)); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/GetNextPipelineTests.cs b/test/Microsoft.ML.AutoML.Tests/GetNextPipelineTests.cs new file mode 100644 index 
0000000000..4846331903 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/GetNextPipelineTests.cs @@ -0,0 +1,84 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using Xunit; +using Newtonsoft.Json; + +namespace Microsoft.ML.AutoML.Test +{ + + public class GetNextPipelineTests + { + [Fact] + public void GetNextPipeline() + { + var context = new MLContext(); + var uciAdult = DatasetUtil.GetUciAdultDataView(); + var columns = DatasetColumnInfoUtil.GetDatasetColumnInfo(context, uciAdult, new ColumnInformation() { LabelColumnName = DatasetUtil.UciAdultLabel }); + + // get next pipeline + var pipeline = PipelineSuggester.GetNextPipeline(context, new List(), columns, TaskKind.BinaryClassification); + + // serialize & deserialize pipeline + var serialized = JsonConvert.SerializeObject(pipeline); + Console.WriteLine(serialized); + var deserialized = JsonConvert.DeserializeObject(serialized); + + // run pipeline + var estimator = deserialized.ToEstimator(context); + var scoredData = estimator.Fit(uciAdult).Transform(uciAdult); + var score = context.BinaryClassification.EvaluateNonCalibrated(scoredData).Accuracy; + var result = new PipelineScore(deserialized, score, true); + + Assert.NotNull(result); + } + + [Fact] + public void GetNextPipelineMock() + { + var context = new MLContext(); + var uciAdult = DatasetUtil.GetUciAdultDataView(); + var columns = DatasetColumnInfoUtil.GetDatasetColumnInfo(context, uciAdult, new ColumnInformation() { LabelColumnName = DatasetUtil.UciAdultLabel }); + + // Get next pipeline loop + var history = new List(); + var task = TaskKind.BinaryClassification; + var maxIterations = 60; + for (var i = 0; i < maxIterations; i++) + { + // Get next pipeline + var pipeline = PipelineSuggester.GetNextPipeline(context, history, 
columns, task); + if (pipeline == null) + { + break; + } + + var result = new PipelineScore(pipeline, AutoMlUtils.Random.Value.NextDouble(), true); + history.Add(result); + } + + Assert.Equal(maxIterations, history.Count); + + // Get all 'Stage 1' and 'Stage 2' runs from Pipeline Suggester + var allAvailableTrainers = RecipeInference.AllowedTrainers(context, task, new ColumnInformation(), null); + var stage1Runs = history.Take(allAvailableTrainers.Count()); + var stage2Runs = history.Skip(allAvailableTrainers.Count()); + + // Get the trainer names from top 3 Stage 1 runs + var topStage1Runs = stage1Runs.OrderByDescending(r => r.Score).Take(3); + var topStage1TrainerNames = topStage1Runs.Select(r => r.Pipeline.Nodes.Last().Name); + + // Get unique trainer names from Stage 2 runs + var stage2TrainerNames = stage2Runs.Select(r => r.Pipeline.Nodes.Last().Name).Distinct(); + + // Assert that are only 3 unique trainers used in stage 2 + Assert.Equal(3, stage2TrainerNames.Count()); + // Assert that all trainers in stage 2 were the top trainers from stage 1 + Assert.False(topStage1TrainerNames.Except(stage2TrainerNames).Any()); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/InferredPipelineTests.cs b/test/Microsoft.ML.AutoML.Tests/InferredPipelineTests.cs new file mode 100644 index 0000000000..fc969bd009 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/InferredPipelineTests.cs @@ -0,0 +1,65 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class InferredPipelineTests + { + [Fact] + public void InferredPipelinesHashTest() + { + var context = new MLContext(); + var columnInfo = new ColumnInformation(); + + // test same learners with no hyperparams have the same hash code + var trainer1 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), columnInfo); + var trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), columnInfo); + var transforms1 = new List(); + var transforms2 = new List(); + var inferredPipeline1 = new SuggestedPipeline(transforms1, new List(), trainer1, context, false); + var inferredPipeline2 = new SuggestedPipeline(transforms2, new List(), trainer2, context, false); + Assert.Equal(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + + // test same learners with hyperparams set vs empty hyperparams have different hash codes + var hyperparams1 = new ParameterSet(new List() { new LongParameterValue("NumberOfLeaves", 2) }); + trainer1 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), columnInfo, hyperparams1); + trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), columnInfo); + inferredPipeline1 = new SuggestedPipeline(transforms1, new List(), trainer1, context, false); + inferredPipeline2 = new SuggestedPipeline(transforms2, new List(), trainer2, context, false); + Assert.NotEqual(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + + // same learners with different hyperparams + hyperparams1 = new ParameterSet(new List() { new LongParameterValue("NumberOfLeaves", 2) }); + var hyperparams2 = new ParameterSet(new List() { new LongParameterValue("NumberOfLeaves", 6) }); + trainer1 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), columnInfo, hyperparams1); + trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), columnInfo, hyperparams2); + 
inferredPipeline1 = new SuggestedPipeline(transforms1, new List(), trainer1, context, false); + inferredPipeline2 = new SuggestedPipeline(transforms2, new List(), trainer2, context, false); + Assert.NotEqual(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + + // same learners with same transforms + trainer1 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), columnInfo); + trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), columnInfo); + transforms1 = new List() { ColumnConcatenatingExtension.CreateSuggestedTransform(context, new[] { "In" }, "Out") }; + transforms2 = new List() { ColumnConcatenatingExtension.CreateSuggestedTransform(context, new[] { "In" }, "Out") }; + inferredPipeline1 = new SuggestedPipeline(transforms1, new List(), trainer1, context, false); + inferredPipeline2 = new SuggestedPipeline(transforms2, new List(), trainer2, context, false); + Assert.Equal(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + + // same transforms with different learners + trainer1 = new SuggestedTrainer(context, new SdcaLogisticRegressionBinaryExtension(), columnInfo); + trainer2 = new SuggestedTrainer(context, new LightGbmBinaryExtension(), columnInfo); + transforms1 = new List() { ColumnConcatenatingExtension.CreateSuggestedTransform(context, new[] { "In" }, "Out") }; + transforms2 = new List() { ColumnConcatenatingExtension.CreateSuggestedTransform(context, new[] { "In" }, "Out") }; + inferredPipeline1 = new SuggestedPipeline(transforms1, new List(), trainer1, context, false); + inferredPipeline2 = new SuggestedPipeline(transforms2, new List(), trainer2, context, false); + Assert.NotEqual(inferredPipeline1.GetHashCode(), inferredPipeline2.GetHashCode()); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/MetricsAgentsTests.cs b/test/Microsoft.ML.AutoML.Tests/MetricsAgentsTests.cs new file mode 100644 index 0000000000..8fbd2aaeaf --- /dev/null +++ 
b/test/Microsoft.ML.AutoML.Tests/MetricsAgentsTests.cs @@ -0,0 +1,165 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Data; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.AutoML.Test +{ + public class MetricsAgentsTests + { + [Fact] + public void BinaryMetricsGetScoreTest() + { + var metrics = MetricsUtil.CreateBinaryClassificationMetrics(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8); + Assert.Equal(0.1, GetScore(metrics, BinaryClassificationMetric.AreaUnderRocCurve)); + Assert.Equal(0.2, GetScore(metrics, BinaryClassificationMetric.Accuracy)); + Assert.Equal(0.3, GetScore(metrics, BinaryClassificationMetric.PositivePrecision)); + Assert.Equal(0.4, GetScore(metrics, BinaryClassificationMetric.PositiveRecall)); + Assert.Equal(0.5, GetScore(metrics, BinaryClassificationMetric.NegativePrecision)); + Assert.Equal(0.6, GetScore(metrics, BinaryClassificationMetric.NegativeRecall)); + Assert.Equal(0.7, GetScore(metrics, BinaryClassificationMetric.F1Score)); + Assert.Equal(0.8, GetScore(metrics, BinaryClassificationMetric.AreaUnderPrecisionRecallCurve)); + } + + [Fact] + public void BinaryMetricsNonPerfectTest() + { + var metrics = MetricsUtil.CreateBinaryClassificationMetrics(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8); + Assert.False(IsPerfectModel(metrics, BinaryClassificationMetric.Accuracy)); + Assert.False(IsPerfectModel(metrics, BinaryClassificationMetric.AreaUnderRocCurve)); + Assert.False(IsPerfectModel(metrics, BinaryClassificationMetric.AreaUnderPrecisionRecallCurve)); + Assert.False(IsPerfectModel(metrics, BinaryClassificationMetric.F1Score)); + Assert.False(IsPerfectModel(metrics, BinaryClassificationMetric.NegativePrecision)); + Assert.False(IsPerfectModel(metrics, BinaryClassificationMetric.NegativeRecall)); + Assert.False(IsPerfectModel(metrics, 
BinaryClassificationMetric.PositivePrecision)); + Assert.False(IsPerfectModel(metrics, BinaryClassificationMetric.PositiveRecall)); + } + + [Fact] + public void BinaryMetricsPerfectTest() + { + var metrics = MetricsUtil.CreateBinaryClassificationMetrics(1, 1, 1, 1, 1, 1, 1, 1); + Assert.True(IsPerfectModel(metrics, BinaryClassificationMetric.Accuracy)); + Assert.True(IsPerfectModel(metrics, BinaryClassificationMetric.AreaUnderRocCurve)); + Assert.True(IsPerfectModel(metrics, BinaryClassificationMetric.AreaUnderPrecisionRecallCurve)); + Assert.True(IsPerfectModel(metrics, BinaryClassificationMetric.F1Score)); + Assert.True(IsPerfectModel(metrics, BinaryClassificationMetric.NegativePrecision)); + Assert.True(IsPerfectModel(metrics, BinaryClassificationMetric.NegativeRecall)); + Assert.True(IsPerfectModel(metrics, BinaryClassificationMetric.PositivePrecision)); + Assert.True(IsPerfectModel(metrics, BinaryClassificationMetric.PositiveRecall)); + } + + [Fact] + public void MulticlassMetricsGetScoreTest() + { + var metrics = MetricsUtil.CreateMulticlassClassificationMetrics(0.1, 0.2, 0.3, 0.4, 0, 0.5, new double[] {}); + Assert.Equal(0.1, GetScore(metrics, MulticlassClassificationMetric.MicroAccuracy)); + Assert.Equal(0.2, GetScore(metrics, MulticlassClassificationMetric.MacroAccuracy)); + Assert.Equal(0.3, GetScore(metrics, MulticlassClassificationMetric.LogLoss)); + Assert.Equal(0.4, GetScore(metrics, MulticlassClassificationMetric.LogLossReduction)); + Assert.Equal(0.5, GetScore(metrics, MulticlassClassificationMetric.TopKAccuracy)); + } + + [Fact] + public void MulticlassMetricsNonPerfectTest() + { + var metrics = MetricsUtil.CreateMulticlassClassificationMetrics(0.1, 0.2, 0.3, 0.4, 0, 0.5, new double[] { }); + Assert.False(IsPerfectModel(metrics, MulticlassClassificationMetric.MacroAccuracy)); + Assert.False(IsPerfectModel(metrics, MulticlassClassificationMetric.MicroAccuracy)); + Assert.False(IsPerfectModel(metrics, MulticlassClassificationMetric.LogLoss)); + 
Assert.False(IsPerfectModel(metrics, MulticlassClassificationMetric.LogLossReduction)); + Assert.False(IsPerfectModel(metrics, MulticlassClassificationMetric.TopKAccuracy)); + } + + [Fact] + public void MulticlassMetricsPerfectTest() + { + var metrics = MetricsUtil.CreateMulticlassClassificationMetrics(1, 1, 0, 1, 0, 1, new double[] { }); + Assert.True(IsPerfectModel(metrics, MulticlassClassificationMetric.MicroAccuracy)); + Assert.True(IsPerfectModel(metrics, MulticlassClassificationMetric.MacroAccuracy)); + Assert.True(IsPerfectModel(metrics, MulticlassClassificationMetric.LogLoss)); + Assert.True(IsPerfectModel(metrics, MulticlassClassificationMetric.LogLossReduction)); + Assert.True(IsPerfectModel(metrics, MulticlassClassificationMetric.TopKAccuracy)); + } + + [Fact] + public void RegressionMetricsGetScoreTest() + { + var metrics = MetricsUtil.CreateRegressionMetrics(0.2, 0.3, 0.4, 0.5, 0.6); + Assert.Equal(0.2, GetScore(metrics, RegressionMetric.MeanAbsoluteError)); + Assert.Equal(0.3, GetScore(metrics, RegressionMetric.MeanSquaredError)); + Assert.Equal(0.4, GetScore(metrics, RegressionMetric.RootMeanSquaredError)); + Assert.Equal(0.6, GetScore(metrics, RegressionMetric.RSquared)); + } + + [Fact] + public void RegressionMetricsNonPerfectTest() + { + var metrics = MetricsUtil.CreateRegressionMetrics(0.2, 0.3, 0.4, 0.5, 0.6); + Assert.False(IsPerfectModel(metrics, RegressionMetric.MeanAbsoluteError)); + Assert.False(IsPerfectModel(metrics, RegressionMetric.MeanSquaredError)); + Assert.False(IsPerfectModel(metrics, RegressionMetric.RootMeanSquaredError)); + Assert.False(IsPerfectModel(metrics, RegressionMetric.RSquared)); + } + + [Fact] + public void RegressionMetricsPerfectTest() + { + var metrics = MetricsUtil.CreateRegressionMetrics(0, 0, 0, 0, 1); + Assert.True(IsPerfectModel(metrics, RegressionMetric.MeanAbsoluteError)); + Assert.True(IsPerfectModel(metrics, RegressionMetric.MeanSquaredError)); + Assert.True(IsPerfectModel(metrics, 
RegressionMetric.RootMeanSquaredError)); + Assert.True(IsPerfectModel(metrics, RegressionMetric.RSquared)); + } + + [Fact] + public void ThrowNotSupportedMetricException() + { + var ex = MetricsAgentUtil.BuildMetricNotSupportedException(BinaryClassificationMetric.Accuracy); + Assert.Equal(typeof(NotSupportedException), ex.GetType()); + } + + private static double GetScore(BinaryClassificationMetrics metrics, BinaryClassificationMetric metric) + { + return new BinaryMetricsAgent(null, metric).GetScore(metrics); + } + + private static double GetScore(MulticlassClassificationMetrics metrics, MulticlassClassificationMetric metric) + { + return new MultiMetricsAgent(null, metric).GetScore(metrics); + } + + private static double GetScore(RegressionMetrics metrics, RegressionMetric metric) + { + return new RegressionMetricsAgent(null, metric).GetScore(metrics); + } + + private static bool IsPerfectModel(BinaryClassificationMetrics metrics, BinaryClassificationMetric metric) + { + var metricsAgent = new BinaryMetricsAgent(null, metric); + return IsPerfectModel(metricsAgent, metrics); + } + + private static bool IsPerfectModel(MulticlassClassificationMetrics metrics, MulticlassClassificationMetric metric) + { + var metricsAgent = new MultiMetricsAgent(null, metric); + return IsPerfectModel(metricsAgent, metrics); + } + + private static bool IsPerfectModel(RegressionMetrics metrics, RegressionMetric metric) + { + var metricsAgent = new RegressionMetricsAgent(null, metric); + return IsPerfectModel(metricsAgent, metrics); + } + + private static bool IsPerfectModel(IMetricsAgent metricsAgent, TMetrics metrics) + { + var score = metricsAgent.GetScore(metrics); + return metricsAgent.IsModelPerfect(score); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/MetricsUtil.cs b/test/Microsoft.ML.AutoML.Tests/MetricsUtil.cs new file mode 100644 index 0000000000..4248c3ca4e --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/MetricsUtil.cs @@ -0,0 +1,49 @@ +// Licensed to the .NET 
Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Reflection; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Test +{ + internal static class MetricsUtil + { + public static BinaryClassificationMetrics CreateBinaryClassificationMetrics( + double auc, double accuracy, double positivePrecision, + double positiveRecall, double negativePrecision, + double negativeRecall, double f1Score, double auprc) + { + return CreateInstance(auc, accuracy, + positivePrecision, positiveRecall, negativePrecision, + negativeRecall, f1Score, auprc); + } + + public static MulticlassClassificationMetrics CreateMulticlassClassificationMetrics( + double accuracyMicro, double accuracyMacro, double logLoss, + double logLossReduction, int topK, double topKAccuracy, + double[] perClassLogLoss) + { + return CreateInstance(accuracyMicro, + accuracyMacro, logLoss, logLossReduction, topK, + topKAccuracy, perClassLogLoss); + } + + public static RegressionMetrics CreateRegressionMetrics(double l1, + double l2, double rms, double lossFn, double rSquared) + { + return CreateInstance(l1, l2, + rms, lossFn, rSquared); + } + + private static T CreateInstance(params object[] args) + { + var type = typeof(T); + var instance = type.Assembly.CreateInstance( + type.FullName, false, + BindingFlags.Instance | BindingFlags.NonPublic, + null, args, null, null); + return (T)instance; + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj b/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj new file mode 100644 index 0000000000..6e5ff4ce14 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/Microsoft.ML.AutoML.Tests.csproj @@ -0,0 +1,35 @@ + + + + + + + + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + + + + + + + + + + + \ No newline at end of file diff 
--git a/test/Microsoft.ML.AutoML.Tests/PurposeInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/PurposeInferenceTests.cs new file mode 100644 index 0000000000..4689d15129 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/PurposeInferenceTests.cs @@ -0,0 +1,38 @@ +using System.Linq; +using Microsoft.ML.Data; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.AutoML.Test +{ + public class PurposeInferenceTests + { + [Fact] + public void PurposeInferenceHiddenColumnsTest() + { + var context = new MLContext(); + + // build basic data view + var schemaBuilder = new DataViewSchema.Builder(); + schemaBuilder.AddColumn(DefaultColumnNames.Label, BooleanDataViewType.Instance); + schemaBuilder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Single); + var schema = schemaBuilder.ToSchema(); + IDataView data = DataViewTestFixture.BuildDummyDataView(schema); + + // normalize 'Features' column. this has the effect of creating 2 columns named + // 'Features' in the data view, the first of which gets marked as 'Hidden' + var normalizer = context.Transforms.NormalizeMinMax(DefaultColumnNames.Features); + data = normalizer.Fit(data).Transform(data); + + // infer purposes + var purposes = PurposeInference.InferPurposes(context, data, new ColumnInformation()); + + Assert.Equal(3, purposes.Count()); + Assert.Equal(ColumnPurpose.Label, purposes[0].Purpose); + // assert first 'Features' purpose (hidden column) is Ignore + Assert.Equal(ColumnPurpose.Ignore, purposes[1].Purpose); + // assert second 'Features' purpose is NumericFeature + Assert.Equal(ColumnPurpose.NumericFeature, purposes[2].Purpose); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/SplitUtilTests.cs b/test/Microsoft.ML.AutoML.Tests/SplitUtilTests.cs new file mode 100644 index 0000000000..8c0d3013bb --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/SplitUtilTests.cs @@ -0,0 +1,70 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Linq; +using Microsoft.ML.Data; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class SplitUtilTests + { + /// + /// When there's only one row of data, assert that + /// attempted cross validation throws (all splits should have empty + /// train or test set). + /// + [Fact] + public void CrossValSplitThrowsWhenNotEnoughData() + { + var mlContext = new MLContext(); + var dataViewBuilder = new ArrayDataViewBuilder(mlContext); + dataViewBuilder.AddColumn("Number", NumberDataViewType.Single, 0f); + dataViewBuilder.AddColumn("Label", NumberDataViewType.Single, 0f); + var dataView = dataViewBuilder.GetDataView(); + Assert.Throws(() => SplitUtil.CrossValSplit(mlContext, dataView, 10, null)); + } + + /// + /// When there are few rows of data, assert that + /// cross validation succeeds, but # of splits is less than 10 + /// (splits with empty train or test sets should not be returned from this API). + /// + [Fact] + public void CrossValSplitSmallDataView() + { + var mlContext = new MLContext(seed: 0); + var dataViewBuilder = new ArrayDataViewBuilder(mlContext); + dataViewBuilder.AddColumn("Number", NumberDataViewType.Single, new float[9]); + dataViewBuilder.AddColumn("Label", NumberDataViewType.Single, new float[9]); + var dataView = dataViewBuilder.GetDataView(); + const int requestedNumSplits = 10; + var splits = SplitUtil.CrossValSplit(mlContext, dataView, requestedNumSplits, null); + Assert.True(splits.trainDatasets.Any()); + Assert.True(splits.trainDatasets.Count() < requestedNumSplits); + Assert.Equal(splits.trainDatasets.Count(), splits.validationDatasets.Count()); + } + + /// + /// Assert that with many rows of data, cross validation produces the requested + /// # of splits. 
+ /// + [Fact] + public void CrossValSplitLargeDataView() + { + var mlContext = new MLContext(seed: 0); + var dataViewBuilder = new ArrayDataViewBuilder(mlContext); + dataViewBuilder.AddColumn("Number", NumberDataViewType.Single, new float[10000]); + dataViewBuilder.AddColumn("Label", NumberDataViewType.Single, new float[10000]); + var dataView = dataViewBuilder.GetDataView(); + const int requestedNumSplits = 10; + var splits = SplitUtil.CrossValSplit(mlContext, dataView, requestedNumSplits, null); + Assert.True(splits.trainDatasets.Any()); + Assert.Equal(requestedNumSplits, splits.trainDatasets.Count()); + Assert.Equal(requestedNumSplits, splits.validationDatasets.Count()); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/SuggestedPipelineBuilderTests.cs b/test/Microsoft.ML.AutoML.Tests/SuggestedPipelineBuilderTests.cs new file mode 100644 index 0000000000..52f319abed --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/SuggestedPipelineBuilderTests.cs @@ -0,0 +1,83 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Collections.Generic; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class SuggestedPipelineBuilderTests + { + private static MLContext _context = new MLContext(); + + [Fact] + public void TrainerWantsCaching() + { + TestPipelineBuilderCaching(BuildAveragedPerceptronTrainer(), + new CacheBeforeTrainer[] { CacheBeforeTrainer.On, CacheBeforeTrainer.Off, CacheBeforeTrainer.Auto }, + new[] { true, false, true }); + } + + [Fact] + public void TrainerDoesntWantCaching() + { + TestPipelineBuilderCaching(BuildLightGbmTrainer(), + new CacheBeforeTrainer[] { CacheBeforeTrainer.On, CacheBeforeTrainer.Off, CacheBeforeTrainer.Auto }, + new[] { true, false, false }); + } + + [Fact] + public void TrainerNeedsNormalization() + { + var pipeline = BuildSuggestedPipeline(BuildAveragedPerceptronTrainer()); + Assert.Equal(EstimatorName.Normalizing.ToString(), + pipeline.Transforms[0].PipelineNode.Name); + } + + [Fact] + public void TrainerNotNeedNormalization() + { + var pipeline = BuildSuggestedPipeline(BuildLightGbmTrainer()); + Assert.Equal(0, pipeline.Transforms.Count); + } + + private static void TestPipelineBuilderCaching( + SuggestedTrainer trainer, + CacheBeforeTrainer[] cacheBeforeTrainerSettings, + bool[] resultShouldHaveCaching) + { + for (var i = 0; i < cacheBeforeTrainerSettings.Length; i++) + { + var suggestedPipeline = BuildSuggestedPipeline(trainer, + cacheBeforeTrainerSettings[i]); + Assert.Equal(resultShouldHaveCaching[i], + suggestedPipeline.ToPipeline().CacheBeforeTrainer); + } + } + + private static SuggestedTrainer BuildAveragedPerceptronTrainer() + { + return new SuggestedTrainer(_context, + new AveragedPerceptronBinaryExtension(), + new ColumnInformation()); + } + + private static SuggestedTrainer BuildLightGbmTrainer() + { + return new SuggestedTrainer(_context, + new LightGbmBinaryExtension(), + new ColumnInformation()); + } + + private static SuggestedPipeline BuildSuggestedPipeline(SuggestedTrainer trainer, + 
CacheBeforeTrainer cacheBeforeTrainer = CacheBeforeTrainer.Auto) + { + return SuggestedPipelineBuilder.Build(_context, + new List(), + new List(), + trainer, cacheBeforeTrainer); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/SweeperTests.cs b/test/Microsoft.ML.AutoML.Tests/SweeperTests.cs new file mode 100644 index 0000000000..2cd9e2c051 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/SweeperTests.cs @@ -0,0 +1,170 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class SweeperTests + { + [Fact] + public void SmacQuickRunTest() + { + var numInitialPopulation = 10; + + var floatValueGenerator = new FloatValueGenerator(new FloatParamArguments() { Name = "float", Min = 1, Max = 1000 }); + var floatLogValueGenerator = new FloatValueGenerator(new FloatParamArguments() { Name = "floatLog", Min = 1, Max = 1000, LogBase = true }); + var longValueGenerator = new LongValueGenerator(new LongParamArguments() { Name = "long", Min = 1, Max = 1000 }); + var longLogValueGenerator = new LongValueGenerator(new LongParamArguments() { Name = "longLog", Min = 1, Max = 1000, LogBase = true }); + var discreteValueGeneator = new DiscreteValueGenerator(new DiscreteParamArguments() { Name = "discrete", Values = new[] { "200", "400", "600", "800" } }); + + var sweeper = new SmacSweeper(new MLContext(), new SmacSweeper.Arguments() + { + SweptParameters = new IValueGenerator[] { + floatValueGenerator, + floatLogValueGenerator, + longValueGenerator, + longLogValueGenerator, + discreteValueGeneator + }, + NumberInitialPopulation = numInitialPopulation + }); + + // sanity check grid + Assert.NotNull(floatValueGenerator[0].ValueText); + Assert.NotNull(floatLogValueGenerator[0].ValueText); + 
Assert.NotNull(longValueGenerator[0].ValueText); + Assert.NotNull(longLogValueGenerator[0].ValueText); + Assert.NotNull(discreteValueGeneator[0].ValueText); + + List results = new List(); + + RunResult bestResult = null; + for (var i = 0; i < numInitialPopulation + 1; i++) + { + ParameterSet[] pars = sweeper.ProposeSweeps(1, results); + + foreach (ParameterSet p in pars) + { + float x1 = float.Parse(p["float"].ValueText); + float x2 = float.Parse(p["floatLog"].ValueText); + long x3 = long.Parse(p["long"].ValueText); + long x4 = long.Parse(p["longLog"].ValueText); + int x5 = int.Parse(p["discrete"].ValueText); + + double metric = x1 + x2 + x3 + x4 + x5; + + RunResult result = new RunResult(p, metric, true); + if (bestResult == null || bestResult.MetricValue < metric) + { + bestResult = result; + } + results.Add(result); + + Console.WriteLine($"{metric}\t{x1},{x2}"); + } + + } + + Console.WriteLine($"Best: {bestResult.MetricValue}"); + + Assert.NotNull(bestResult); + Assert.True(bestResult.MetricValue > 0); + } + + [Fact(Skip = "This test is too slow to run as part of automation.")] + public void Smac4ParamsConvergenceTest() + { + var sweeper = new SmacSweeper(new MLContext(), new SmacSweeper.Arguments() + { + SweptParameters = new INumericValueGenerator[] { + new FloatValueGenerator(new FloatParamArguments() { Name = "x1", Min = 1, Max = 1000}), + new FloatValueGenerator(new FloatParamArguments() { Name = "x2", Min = 1, Max = 1000}), + new FloatValueGenerator(new FloatParamArguments() { Name = "x3", Min = 1, Max = 1000}), + new FloatValueGenerator(new FloatParamArguments() { Name = "x4", Min = 1, Max = 1000}), + }, + }); + + List results = new List(); + + RunResult bestResult = null; + for (var i = 0; i < 300; i++) + { + ParameterSet[] pars = sweeper.ProposeSweeps(1, results); + + // if run converged, break + if (pars == null) + { + break; + } + + foreach (ParameterSet p in pars) + { + float x1 = (p["x1"] as FloatParameterValue).Value; + float x2 = (p["x2"] as 
FloatParameterValue).Value; + float x3 = (p["x3"] as FloatParameterValue).Value; + float x4 = (p["x4"] as FloatParameterValue).Value; + + double metric = -200 * (Math.Abs(100 - x1) + + Math.Abs(300 - x2) + + Math.Abs(500 - x3) + + Math.Abs(700 - x4)); + + RunResult result = new RunResult(p, metric, true); + if (bestResult == null || bestResult.MetricValue < metric) + { + bestResult = result; + } + results.Add(result); + + Console.WriteLine($"{metric}\t{x1},{x2},{x3},{x4}"); + } + + } + + Console.WriteLine($"Best: {bestResult.MetricValue}"); + } + + [Fact(Skip = "This test is too slow to run as part of automation.")] + public void Smac2ParamsConvergenceTest() + { + var sweeper = new SmacSweeper(new MLContext(), new SmacSweeper.Arguments() + { + SweptParameters = new INumericValueGenerator[] { + new FloatValueGenerator(new FloatParamArguments() { Name = "foo", Min = 1, Max = 5}), + new LongValueGenerator(new LongParamArguments() { Name = "bar", Min = 1, Max = 1000, LogBase = true }) + }, + }); + + Random rand = new Random(0); + List results = new List(); + + int count = 0; + while (true) + { + ParameterSet[] pars = sweeper.ProposeSweeps(1, results); + if(pars == null) + { + break; + } + foreach (ParameterSet p in pars) + { + float foo = 0; + long bar = 0; + + foo = (p["foo"] as FloatParameterValue).Value; + bar = (p["bar"] as LongParameterValue).Value; + + double metric = ((5 - Math.Abs(4 - foo)) * 200) + (1001 - Math.Abs(33 - bar)) + rand.Next(1, 20); + results.Add(new RunResult(p, metric, true)); + count++; + Console.WriteLine("{0}--{1}--{2}--{3}", count, foo, bar, metric); + } + } + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/TestData/BinaryDatasetWithBoolColumn.txt b/test/Microsoft.ML.AutoML.Tests/TestData/BinaryDatasetWithBoolColumn.txt new file mode 100644 index 0000000000..7fc6e787df --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TestData/BinaryDatasetWithBoolColumn.txt @@ -0,0 +1,5 @@ +Label,Bool +0,1 +0,0 +1,1 +1,0 \ No newline at end of file 
diff --git a/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithDefaultColumnNames.txt b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithDefaultColumnNames.txt new file mode 100644 index 0000000000..26aa3a2102 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithDefaultColumnNames.txt @@ -0,0 +1,4 @@ +Label,Weight,Name,Features,FeatureContributions,Feature1 +0,1,GUID1,1,1,1 +0,1,GUID2,1,1,1 +1,1,GUID3,1,1,1 \ No newline at end of file diff --git a/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithEmptyColumn.txt b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithEmptyColumn.txt new file mode 100644 index 0000000000..7033743b5b --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TestData/DatasetWithEmptyColumn.txt @@ -0,0 +1,4 @@ +Label,Feature1,Empty +0,2, +0,4, +1,1, \ No newline at end of file diff --git a/test/Microsoft.ML.AutoML.Tests/TestData/NameColumnIsOnlyFeatureDataset.txt b/test/Microsoft.ML.AutoML.Tests/TestData/NameColumnIsOnlyFeatureDataset.txt new file mode 100644 index 0000000000..3e436a9ae6 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TestData/NameColumnIsOnlyFeatureDataset.txt @@ -0,0 +1,103 @@ +Label,Username +0,a0 +0,a1 +0,a2 +0,a3 +0,a4 +0,a5 +0,a6 +0,a7 +0,a8 +0,a9 +0,a10 +0,a11 +0,a12 +0,a13 +0,a14 +0,a15 +0,a16 +0,a17 +0,a18 +0,a19 +0,a20 +0,a21 +0,a22 +0,a23 +0,a24 +0,a25 +0,a26 +0,a27 +0,a28 +0,a29 +0,a30 +0,a31 +0,a32 +0,a33 +0,a34 +0,a35 +0,a36 +0,a37 +0,a38 +0,a39 +0,a40 +0,a41 +0,a42 +0,a43 +0,a44 +0,a45 +0,a46 +0,a47 +0,a48 +0,a49 +0,a50 +1,b0 +1,b1 +1,b2 +1,b3 +1,b4 +1,b5 +1,b6 +1,b7 +1,b8 +1,b9 +1,b10 +1,b11 +1,b12 +1,b13 +1,b14 +1,b15 +1,b16 +1,b17 +1,b18 +1,b19 +1,b20 +1,b21 +1,b22 +1,b23 +1,b24 +1,b25 +1,b26 +1,b27 +1,b28 +1,b29 +1,b30 +1,b31 +1,b32 +1,b33 +1,b34 +1,b35 +1,b36 +1,b37 +1,b38 +1,b39 +1,b40 +1,b41 +1,b42 +1,b43 +1,b44 +1,b45 +1,b46 +1,b47 +1,b48 +1,b49 +1,b50 \ No newline at end of file diff --git a/test/Microsoft.ML.AutoML.Tests/TestData/TrivialMulticlassDataset.txt 
b/test/Microsoft.ML.AutoML.Tests/TestData/TrivialMulticlassDataset.txt new file mode 100644 index 0000000000..c9566415b6 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TestData/TrivialMulticlassDataset.txt @@ -0,0 +1,181 @@ +Target Row Column +1 14 20 +1 19 26 +3 17 4 +1 10 20 +1 3 5 +1 7 5 +1 18 36 +2 1 36 +2 1 38 +3 17 1 +2 6 26 +2 9 30 +3 13 8 +2 7 33 +2 8 30 +3 10 1 +1 18 25 +1 13 12 +1 3 2 +2 8 28 +1 11 24 +2 3 28 +2 1 16 +1 9 7 +1 15 16 +3 19 4 +1 1 8 +1 8 0 +1 10 34 +1 18 37 +2 1 17 +2 8 39 +1 17 30 +2 1 27 +2 0 38 +1 11 16 +3 19 3 +1 7 8 +1 13 13 +1 19 31 +3 16 1 +1 5 1 +2 6 11 +1 9 5 +3 10 6 +1 1 2 +2 6 30 +2 7 15 +1 17 21 +1 18 23 +3 10 7 +2 5 39 +2 2 27 +3 12 6 +3 11 4 +1 9 3 +1 12 22 +2 8 19 +2 1 14 +1 11 11 +1 10 36 +3 12 4 +1 15 21 +1 17 37 +1 6 3 +2 3 18 +1 10 10 +1 11 33 +1 18 19 +2 7 35 +3 10 2 +1 12 30 +1 12 26 +2 1 31 +2 5 21 +2 1 11 +1 7 3 +2 8 36 +3 10 4 +1 18 26 +2 8 10 +1 10 22 +1 15 14 +3 16 0 +2 0 30 +2 3 34 +3 13 9 +1 0 2 +1 15 36 +1 15 23 +1 10 30 +2 6 20 +2 9 24 +2 9 35 +1 7 6 +2 7 39 +2 5 20 +3 12 8 +2 9 12 +1 17 25 +1 12 33 +2 6 19 +1 17 10 +2 4 35 +1 15 31 +3 12 7 +1 17 16 +2 1 19 +2 3 25 +1 16 30 +1 19 30 +1 5 4 +2 6 10 +1 18 20 +1 13 26 +2 3 39 +2 2 20 +1 4 7 +2 3 33 +1 16 20 +2 1 21 +3 15 2 +3 19 2 +1 12 10 +2 5 37 +2 1 32 +3 18 6 +1 2 1 +1 16 21 +2 1 23 +1 17 33 +2 5 11 +2 3 14 +1 11 12 +1 13 20 +1 19 38 +1 15 10 +2 8 11 +3 11 0 +1 18 10 +1 19 24 +1 13 11 +2 4 23 +1 16 26 +1 7 7 +1 17 29 +1 18 30 +1 13 10 +2 6 21 +1 19 32 +2 7 12 +1 12 28 +2 2 11 +1 12 15 +2 8 32 +3 15 9 +3 16 5 +1 9 1 +1 19 28 +3 16 3 +1 15 17 +2 7 38 +1 16 38 +1 14 26 +1 10 26 +1 10 37 +3 18 5 +2 5 27 +2 2 22 +1 11 39 +1 16 36 +1 0 9 +2 5 19 +1 18 28 +1 12 13 +1 17 17 +1 8 1 +2 6 15 +3 14 4 +1 1 4 \ No newline at end of file diff --git a/test/Microsoft.ML.AutoML.Tests/TextFileSampleTests.cs b/test/Microsoft.ML.AutoML.Tests/TextFileSampleTests.cs new file mode 100644 index 0000000000..a763886cd0 --- /dev/null +++ 
b/test/Microsoft.ML.AutoML.Tests/TextFileSampleTests.cs @@ -0,0 +1,50 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.IO; +using System.Text; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class TextFileSampleTests + { + [Fact] + public void CanParseLargeRandomStream() + { + using (var stream = new MemoryStream()) + { + const int numRows = 100000; + const int rowSize = 100; + + var eol = Encoding.UTF8.GetBytes("\r\n"); + + for (var i = 0; i < numRows; i++) + { + var row = new byte[rowSize]; + AutoMlUtils.Random.Value.NextBytes(row); + + // ensure byte array has no 0s, so text file sampler doesn't + // think file is encoded with UTF-16 or UTF-32 without a BOM + for (var k = 0; k < row.Length; k++) + { + if(row[k] == 0) + { + row[k] = 1; + } + } + stream.Write(row, 0, rowSize); + stream.Write(eol, 0, eol.Length); + } + + stream.Seek(0, SeekOrigin.Begin); + + var sample = TextFileSample.CreateFromFullStream(stream); + Assert.NotNull(sample); + Assert.True(sample.FullFileSize > 0); + } + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/TrainerExtensionsTests.cs b/test/Microsoft.ML.AutoML.Tests/TrainerExtensionsTests.cs new file mode 100644 index 0000000000..1c1ac1f288 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TrainerExtensionsTests.cs @@ -0,0 +1,311 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class TrainerExtensionsTests + { + [Fact] + public void TrainerExtensionInstanceTests() + { + var context = new MLContext(); + var columnInfo = new ColumnInformation(); + var trainerNames = Enum.GetValues(typeof(TrainerName)).Cast() + .Except(new[] { TrainerName.Ova }); + foreach (var trainerName in trainerNames) + { + var extension = TrainerExtensionCatalog.GetTrainerExtension(trainerName); + var sweepParams = extension.GetHyperparamSweepRanges(); + Assert.NotNull(sweepParams); + foreach (var sweepParam in sweepParams) + { + sweepParam.RawValue = 1; + } + var instance = extension.CreateInstance(context, sweepParams, columnInfo); + Assert.NotNull(instance); + var pipelineNode = extension.CreatePipelineNode(null, columnInfo); + Assert.NotNull(pipelineNode); + } + } + + [Fact] + public void BuildLightGbmPipelineNode() + { + var sweepParams = SweepableParams.BuildLightGbmParams(); + foreach (var sweepParam in sweepParams) + { + sweepParam.RawValue = 1; + } + + var pipelineNode = new LightGbmBinaryExtension().CreatePipelineNode(sweepParams, new ColumnInformation()); + + var expectedJson = @"{ + ""Name"": ""LightGbmBinary"", + ""NodeType"": ""Trainer"", + ""InColumns"": [ + ""Features"" + ], + ""OutColumns"": [ + ""Score"" + ], + ""Properties"": { + ""NumberOfIterations"": 20, + ""LearningRate"": 1, + ""NumberOfLeaves"": 1, + ""MinimumExampleCountPerLeaf"": 10, + ""UseCategoricalSplit"": false, + ""HandleMissingValue"": false, + ""MinimumExampleCountPerGroup"": 50, + ""MaximumCategoricalSplitPointCount"": 16, + ""CategoricalSmoothing"": 10, + ""L2CategoricalRegularization"": 0.5, + ""Booster"": { + ""Name"": ""GradientBooster.Options"", + ""Properties"": { + ""L2Regularization"": 0.5, + ""L1Regularization"": 0.5 + } + }, + ""LabelColumnName"": ""Label"" + } +}"; + Util.AssertObjectMatchesJson(expectedJson, pipelineNode); + } + + 
[Fact] + public void BuildSdcaPipelineNode() + { + var sweepParams = SweepableParams.BuildSdcaParams(); + foreach (var sweepParam in sweepParams) + { + sweepParam.RawValue = 1; + } + + var pipelineNode = new SdcaLogisticRegressionBinaryExtension().CreatePipelineNode(sweepParams, new ColumnInformation()); + var expectedJson = @"{ + ""Name"": ""SdcaLogisticRegressionBinary"", + ""NodeType"": ""Trainer"", + ""InColumns"": [ + ""Features"" + ], + ""OutColumns"": [ + ""Score"" + ], + ""Properties"": { + ""L2Regularization"": 1E-07, + ""L1Regularization"": 0.0, + ""ConvergenceTolerance"": 0.01, + ""MaximumNumberOfIterations"": 10, + ""Shuffle"": true, + ""BiasLearningRate"": 0.01, + ""LabelColumnName"": ""Label"" + } +}"; + Util.AssertObjectMatchesJson(expectedJson, pipelineNode); + } + + [Fact] + public void BuildLightGbmPipelineNodeDefaultParams() + { + var pipelineNode = new LightGbmBinaryExtension().CreatePipelineNode( + new List(), + new ColumnInformation()); + var expectedJson = @"{ + ""Name"": ""LightGbmBinary"", + ""NodeType"": ""Trainer"", + ""InColumns"": [ + ""Features"" + ], + ""OutColumns"": [ + ""Score"" + ], + ""Properties"": { + ""LabelColumnName"": ""Label"" + } +}"; + Util.AssertObjectMatchesJson(expectedJson, pipelineNode); + } + + [Fact] + public void BuildPipelineNodeWithCustomColumns() + { + var columnInfo = new ColumnInformation() + { + LabelColumnName = "L", + ExampleWeightColumnName = "W" + }; + var sweepParams = SweepableParams.BuildFastForestParams(); + foreach (var sweepParam in sweepParams) + { + sweepParam.RawValue = 1; + } + + var pipelineNode = new FastForestBinaryExtension().CreatePipelineNode(sweepParams, columnInfo); + var expectedJson = @"{ + ""Name"": ""FastForestBinary"", + ""NodeType"": ""Trainer"", + ""InColumns"": [ + ""Features"" + ], + ""OutColumns"": [ + ""Score"" + ], + ""Properties"": { + ""NumberOfLeaves"": 1, + ""MinimumExampleCountPerLeaf"": 10, + ""NumberOfTrees"": 100, + ""LabelColumnName"": ""L"", + 
""ExampleWeightColumnName"": ""W"" + } +}"; + Util.AssertObjectMatchesJson(expectedJson, pipelineNode); + } + + [Fact] + public void BuildDefaultAveragedPerceptronPipelineNode() + { + var pipelineNode = new AveragedPerceptronBinaryExtension().CreatePipelineNode(null, new ColumnInformation() { LabelColumnName = "L" }); + var expectedJson = @"{ + ""Name"": ""AveragedPerceptronBinary"", + ""NodeType"": ""Trainer"", + ""InColumns"": [ + ""Features"" + ], + ""OutColumns"": [ + ""Score"" + ], + ""Properties"": { + ""LabelColumnName"": ""L"", + ""NumberOfIterations"": 10 + } +}"; + Util.AssertObjectMatchesJson(expectedJson, pipelineNode); + } + + [Fact] + public void BuildOvaPipelineNode() + { + var pipelineNode = new FastForestOvaExtension().CreatePipelineNode(null, new ColumnInformation()); + var expectedJson = @"{ + ""Name"": ""Ova"", + ""NodeType"": ""Trainer"", + ""InColumns"": null, + ""OutColumns"": null, + ""Properties"": { + ""LabelColumnName"": ""Label"", + ""BinaryTrainer"": { + ""Name"": ""FastForestBinary"", + ""NodeType"": ""Trainer"", + ""InColumns"": [ + ""Features"" + ], + ""OutColumns"": [ + ""Score"" + ], + ""Properties"": { + ""LabelColumnName"": ""Label"" + } + } + } +}"; + Util.AssertObjectMatchesJson(expectedJson, pipelineNode); + } + + [Fact] + public void BuildParameterSetLightGbm() + { + var props = new Dictionary() + { + {"NumberOfIterations", 1 }, + {"LearningRate", 1 }, + {"Booster", new CustomProperty() { + Name = "GradientBooster.Options", + Properties = new Dictionary() + { + {"L2Regularization", 1 }, + {"L1Regularization", 1 }, + } + } }, + }; + var binaryParams = TrainerExtensionUtil.BuildParameterSet(TrainerName.LightGbmBinary, props); + var multiParams = TrainerExtensionUtil.BuildParameterSet(TrainerName.LightGbmMulti, props); + var regressionParams = TrainerExtensionUtil.BuildParameterSet(TrainerName.LightGbmRegression, props); + + foreach (var paramSet in new ParameterSet[] { binaryParams, multiParams, regressionParams }) + { + 
Assert.Equal(4, paramSet.Count); + Assert.Equal("1", paramSet["NumberOfIterations"].ValueText); + Assert.Equal("1", paramSet["LearningRate"].ValueText); + Assert.Equal("1", paramSet["L2Regularization"].ValueText); + Assert.Equal("1", paramSet["L1Regularization"].ValueText); + } + } + + [Fact] + public void BuildParameterSetSdca() + { + var props = new Dictionary() + { + {"LearningRate", 1 }, + }; + + var sdcaParams = TrainerExtensionUtil.BuildParameterSet(TrainerName.SdcaLogisticRegressionBinary, props); + + Assert.Equal(1, sdcaParams.Count); + Assert.Equal("1", sdcaParams["LearningRate"].ValueText); + } + + [Fact] + public void PublicToPrivateTrainerNamesBinaryTest() + { + var publicNames = Enum.GetValues(typeof(BinaryClassificationTrainer)).Cast(); + var internalNames = TrainerExtensionUtil.GetTrainerNames(publicNames); + Assert.Equal(publicNames.Distinct().Count(), internalNames.Distinct().Count()); + } + + [Fact] + public void PublicToPrivateTrainerNamesMultiTest() + { + var publicNames = Enum.GetValues(typeof(MulticlassClassificationTrainer)).Cast(); + var internalNames = TrainerExtensionUtil.GetTrainerNames(publicNames); + Assert.Equal(publicNames.Distinct().Count(), internalNames.Distinct().Count()); + } + + [Fact] + public void PublicToPrivateTrainerNamesRegressionTest() + { + var publicNames = Enum.GetValues(typeof(RegressionTrainer)).Cast(); + var internalNames = TrainerExtensionUtil.GetTrainerNames(publicNames); + Assert.Equal(publicNames.Distinct().Count(), internalNames.Distinct().Count()); + } + + [Fact] + public void PublicToPrivateTrainerNamesNullTest() + { + var internalNames = TrainerExtensionUtil.GetTrainerNames(null as IEnumerable); + Assert.Null(internalNames); + } + + [Fact] + public void AllowedTrainersWhitelistNullTest() + { + var trainers = RecipeInference.AllowedTrainers(new MLContext(), TaskKind.BinaryClassification, new ColumnInformation(), null); + Assert.True(trainers.Any()); + } + + [Fact] + public void AllowedTrainersWhitelistTest() 
+ { + var whitelist = new[] { TrainerName.AveragedPerceptronBinary, TrainerName.FastForestBinary }; + var trainers = RecipeInference.AllowedTrainers(new MLContext(), TaskKind.BinaryClassification, new ColumnInformation(), whitelist); + Assert.Equal(whitelist.Count(), trainers.Count()); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/TransformInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/TransformInferenceTests.cs new file mode 100644 index 0000000000..69284fd338 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TransformInferenceTests.cs @@ -0,0 +1,757 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class TransformInferenceTests + { + [Fact] + public void TransformInferenceNumAndCatCols() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Numeric1", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Categorical1", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(7, null)), + new DatasetColumnInfo("Categorical2", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(7, null)), + new DatasetColumnInfo("LargeCat1", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(500, null)), + new DatasetColumnInfo("LargeCat2", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(500, null)), + }, @"[ + { + ""Name"": ""OneHotEncoding"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Categorical1"", + ""Categorical2"" + ], + ""OutColumns"": [ + ""Categorical1"", + ""Categorical2"" + ], + ""Properties"": {} + }, + { + ""Name"": ""OneHotHashEncoding"", + 
""NodeType"": ""Transform"", + ""InColumns"": [ + ""LargeCat1"", + ""LargeCat2"" + ], + ""OutColumns"": [ + ""LargeCat1"", + ""LargeCat2"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Categorical1"", + ""Categorical2"", + ""LargeCat1"", + ""LargeCat2"", + ""Numeric1"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceNumCatAndFeatCols() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Numeric1", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Categorical1", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(7, null)), + new DatasetColumnInfo("Categorical2", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(7, null)), + new DatasetColumnInfo("LargeCat1", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(500, null)), + new DatasetColumnInfo("LargeCat2", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(500, null)), + }, @"[ + { + ""Name"": ""OneHotEncoding"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Categorical1"", + ""Categorical2"" + ], + ""OutColumns"": [ + ""Categorical1"", + ""Categorical2"" + ], + ""Properties"": {} + }, + { + ""Name"": ""OneHotHashEncoding"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""LargeCat1"", + ""LargeCat2"" + ], + ""OutColumns"": [ + ""LargeCat1"", + ""LargeCat2"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Categorical1"", + ""Categorical2"", + ""LargeCat1"", + ""LargeCat2"", + ""Features"", + ""Numeric1"" + ], + ""OutColumns"": 
[ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceCatAndFeatCols() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Categorical1", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(7, null)), + new DatasetColumnInfo("LargeCat1", TextDataViewType.Instance, ColumnPurpose.CategoricalFeature, new ColumnDimensions(500, null)), + }, @"[ + { + ""Name"": ""OneHotEncoding"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Categorical1"" + ], + ""OutColumns"": [ + ""Categorical1"" + ], + ""Properties"": {} + }, + { + ""Name"": ""OneHotHashEncoding"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""LargeCat1"" + ], + ""OutColumns"": [ + ""LargeCat1"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Categorical1"", + ""LargeCat1"", + ""Features"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceNumericCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Numeric", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, + @"[ + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Numeric"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceNumericCols() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Numeric1", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Numeric2", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""ColumnConcatenating"", + 
""NodeType"": ""Transform"", + ""InColumns"": [ + ""Numeric1"", + ""Numeric2"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceFeatColScalar() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Features"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceFeatColVector() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, new VectorDataViewType(NumberDataViewType.Single), ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[]"); + } + + [Fact] + public void NumericAndFeatCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Numeric", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Features"", + ""Numeric"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void NumericScalarCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Numeric", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Numeric"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void NumericVectorCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Numeric", new 
VectorDataViewType(NumberDataViewType.Single), ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""ColumnCopying"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Numeric"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceTextCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Text", TextDataViewType.Instance, ColumnPurpose.TextFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""TextFeaturizing"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Text"" + ], + ""OutColumns"": [ + ""Text_tf"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnCopying"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Text_tf"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceTextAndFeatCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Text", TextDataViewType.Instance, ColumnPurpose.TextFeature, new ColumnDimensions(null, null)), + }, + @"[ + { + ""Name"": ""TextFeaturizing"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Text"" + ], + ""OutColumns"": [ + ""Text_tf"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Text_tf"", + ""Features"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceBoolCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Bool", BooleanDataViewType.Instance, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""TypeConverting"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Bool"" + ], + ""OutColumns"": [ + ""Bool"" + ], + 
""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Bool"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceBoolAndNumCols() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Numeric", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Bool", BooleanDataViewType.Instance, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""TypeConverting"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Bool"" + ], + ""OutColumns"": [ + ""Bool"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Bool"", + ""Numeric"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceBoolAndFeatCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Bool", BooleanDataViewType.Instance, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""TypeConverting"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Bool"" + ], + ""OutColumns"": [ + ""Bool"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Bool"", + ""Features"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceNumericMissingCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Missing", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, true)), + new DatasetColumnInfo("Numeric", NumberDataViewType.Single, 
ColumnPurpose.NumericFeature, new ColumnDimensions(null, false)), + }, @"[ + { + ""Name"": ""MissingValueIndicating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing"" + ], + ""OutColumns"": [ + ""Missing_MissingIndicator"" + ], + ""Properties"": {} + }, + { + ""Name"": ""TypeConverting"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing_MissingIndicator"" + ], + ""OutColumns"": [ + ""Missing_MissingIndicator"" + ], + ""Properties"": {} + }, + { + ""Name"": ""MissingValueReplacing"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing"" + ], + ""OutColumns"": [ + ""Missing"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing_MissingIndicator"", + ""Missing"", + ""Numeric"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceNumericMissingCols() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Missing1", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, true)), + new DatasetColumnInfo("Missing2", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, true)), + new DatasetColumnInfo("Numeric", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, false)), + }, @"[ + { + ""Name"": ""MissingValueIndicating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing1"", + ""Missing2"" + ], + ""OutColumns"": [ + ""Missing1_MissingIndicator"", + ""Missing2_MissingIndicator"" + ], + ""Properties"": {} + }, + { + ""Name"": ""TypeConverting"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing1_MissingIndicator"", + ""Missing2_MissingIndicator"" + ], + ""OutColumns"": [ + ""Missing1_MissingIndicator"", + ""Missing2_MissingIndicator"" + ], + ""Properties"": {} + }, + { + ""Name"": ""MissingValueReplacing"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + 
""Missing1"", + ""Missing2"" + ], + ""OutColumns"": [ + ""Missing1"", + ""Missing2"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing1_MissingIndicator"", + ""Missing2_MissingIndicator"", + ""Missing1"", + ""Missing2"", + ""Numeric"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceIgnoreCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Numeric1", NumberDataViewType.Single, ColumnPurpose.Ignore, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Numeric2", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Numeric2"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformInferenceDefaultLabelCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, new VectorDataViewType(NumberDataViewType.Single), ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo(DefaultColumnNames.Label, NumberDataViewType.Single, ColumnPurpose.Label, new ColumnDimensions(null, null)), + }, @"[]"); + } + + [Fact] + public void TransformInferenceCustomLabelCol() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, new VectorDataViewType(NumberDataViewType.Single), ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("CustomLabel", NumberDataViewType.Single, ColumnPurpose.Label, new ColumnDimensions(null, null)), + }, @"[]"); + } + + [Fact] + public void TransformInferenceCustomTextLabelColMulticlass() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo(DefaultColumnNames.Features, new VectorDataViewType(NumberDataViewType.Single), 
ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("CustomLabel", TextDataViewType.Instance, ColumnPurpose.Label, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""ValueToKeyMapping"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""CustomLabel"" + ], + ""OutColumns"": [ + ""CustomLabel"" + ], + ""Properties"": {} + } +]", TaskKind.MulticlassClassification); + } + + [Fact] + public void TransformInferenceMissingNameCollision() + { + TransformInferenceTestCore(new[] + { + new DatasetColumnInfo("Missing", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, true)), + new DatasetColumnInfo("Missing_MissingIndicator", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, false)), + new DatasetColumnInfo("Missing_MissingIndicator0", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, false)), + }, @"[ + { + ""Name"": ""MissingValueIndicating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing"" + ], + ""OutColumns"": [ + ""Missing_MissingIndicator1"" + ], + ""Properties"": {} + }, + { + ""Name"": ""TypeConverting"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing_MissingIndicator1"" + ], + ""OutColumns"": [ + ""Missing_MissingIndicator1"" + ], + ""Properties"": {} + }, + { + ""Name"": ""MissingValueReplacing"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing"" + ], + ""OutColumns"": [ + ""Missing"" + ], + ""Properties"": {} + }, + { + ""Name"": ""ColumnConcatenating"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""Missing_MissingIndicator1"", + ""Missing"", + ""Missing_MissingIndicator"", + ""Missing_MissingIndicator0"" + ], + ""OutColumns"": [ + ""Features"" + ], + ""Properties"": {} + } +]"); + } + + private static void TransformInferenceTestCore( + DatasetColumnInfo[] columns, + string expectedJson, + TaskKind task = TaskKind.BinaryClassification) + { + var transforms = 
TransformInferenceApi.InferTransforms(new MLContext(), task, columns); + TestApplyTransformsToRealDataView(transforms, columns); + var pipelineNodes = transforms.Select(t => t.PipelineNode); + Util.AssertObjectMatchesJson(expectedJson, pipelineNodes); + } + + private static void TestApplyTransformsToRealDataView(IEnumerable transforms, + IEnumerable columns) + { + // create a dummy data view from input columns + var data = DataViewTestFixture.BuildDummyDataView(columns); + + // iterate thru suggested transforms and apply it to a real data view + foreach (var transform in transforms.Select(t => t.Estimator)) + { + data = transform.Fit(data).Transform(data); + } + + // assert Features column of type 'R4' exists + var featuresCol = data.Schema.GetColumnOrNull(DefaultColumnNames.Features); + Assert.NotNull(featuresCol); + Assert.True(featuresCol.Value.Type.IsVector()); + Assert.Equal(NumberDataViewType.Single, featuresCol.Value.Type.GetItemType()); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/TransformPostTrainerInferenceTests.cs b/test/Microsoft.ML.AutoML.Tests/TransformPostTrainerInferenceTests.cs new file mode 100644 index 0000000000..558accf9ce --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/TransformPostTrainerInferenceTests.cs @@ -0,0 +1,70 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class TransformPostTrainerInferenceTests + { + [Fact] + public void TransformPostTrainerMulticlassNonKeyLabel() + { + TransformPostTrainerInferenceTestCore(TaskKind.MulticlassClassification, + new[] + { + new DatasetColumnInfo("Numeric1", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Label", NumberDataViewType.Single, ColumnPurpose.Label, new ColumnDimensions(null, null)), + }, @"[ + { + ""Name"": ""KeyToValueMapping"", + ""NodeType"": ""Transform"", + ""InColumns"": [ + ""PredictedLabel"" + ], + ""OutColumns"": [ + ""PredictedLabel"" + ], + ""Properties"": {} + } +]"); + } + + [Fact] + public void TransformPostTrainerBinaryLabel() + { + TransformPostTrainerInferenceTestCore(TaskKind.BinaryClassification, + new[] + { + new DatasetColumnInfo("Numeric1", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Label", NumberDataViewType.Single, ColumnPurpose.Label, new ColumnDimensions(null, null)), + }, @"[]"); + } + + [Fact] + public void TransformPostTrainerMulticlassKeyLabel() + { + TransformPostTrainerInferenceTestCore(TaskKind.MulticlassClassification, + new[] + { + new DatasetColumnInfo("Numeric1", NumberDataViewType.Single, ColumnPurpose.NumericFeature, new ColumnDimensions(null, null)), + new DatasetColumnInfo("Label", new KeyDataViewType(typeof(uint), 3), ColumnPurpose.Label, new ColumnDimensions(null, null)), + }, @"[]"); + } + + private static void TransformPostTrainerInferenceTestCore( + TaskKind task, + DatasetColumnInfo[] columns, + string expectedJson) + { + var transforms = TransformInferenceApi.InferTransformsPostTrainer(new MLContext(), task, columns); + var pipelineNodes = transforms.Select(t => t.PipelineNode); + Util.AssertObjectMatchesJson(expectedJson, pipelineNodes); 
+ } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/UserInputValidationTests.cs b/test/Microsoft.ML.AutoML.Tests/UserInputValidationTests.cs new file mode 100644 index 0000000000..d56e1026f4 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/UserInputValidationTests.cs @@ -0,0 +1,301 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.IO; +using Microsoft.ML.Data; +using Xunit; + +namespace Microsoft.ML.AutoML.Test +{ + + public class UserInputValidationTests + { + private static readonly IDataView Data = DatasetUtil.GetUciAdultDataView(); + + [Fact] + public void ValidateExperimentExecuteNullTrainData() + { + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(null, new ColumnInformation(), null, TaskKind.Regression)); + Assert.StartsWith("Training data cannot be null", ex.Message); + } + + [Fact] + public void ValidateExperimentExecuteNullLabel() + { + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(Data, + new ColumnInformation() { LabelColumnName = null }, null, TaskKind.Regression)); + + Assert.Equal("Provided label column cannot be null", ex.Message); + } + + [Fact] + public void ValidateExperimentExecuteLabelNotInTrain() + { + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(Data, + new ColumnInformation() { LabelColumnName = "L" }, null, TaskKind.Regression)); + + Assert.Equal("Provided label column 'L' not found in training data.", ex.Message); + } + + [Fact] + public void ValidateExperimentExecuteNumericColNotInTrain() + { + var columnInfo = new ColumnInformation(); + columnInfo.NumericColumnNames.Add("N"); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(Data, columnInfo, null, TaskKind.Regression)); + Assert.Equal("Provided label 
column 'Label' was of type Boolean, but only type Single is allowed.", ex.Message); + } + + [Fact] + public void ValidateExperimentExecuteNullNumericCol() + { + var columnInfo = new ColumnInformation(); + columnInfo.NumericColumnNames.Add(null); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(Data, columnInfo, null, TaskKind.Regression)); + Assert.Equal("Null column string was specified as numeric in column information", ex.Message); + } + + [Fact] + public void ValidateExperimentExecuteDuplicateCol() + { + var columnInfo = new ColumnInformation(); + columnInfo.NumericColumnNames.Add(DefaultColumnNames.Label); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(Data, columnInfo, null, TaskKind.Regression)); + } + + [Fact] + public void ValidateExperimentExecuteArgsTrainValidColCountMismatch() + { + var context = new MLContext(); + + var trainDataBuilder = new ArrayDataViewBuilder(context); + trainDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 }); + trainDataBuilder.AddColumn("1", new string[] { "1" }); + var trainData = trainDataBuilder.GetDataView(); + + var validDataBuilder = new ArrayDataViewBuilder(context); + validDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 }); + var validData = validDataBuilder.GetDataView(); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(trainData, + new ColumnInformation() { LabelColumnName = "0" }, validData, TaskKind.Regression)); + Assert.StartsWith("Training data and validation data schemas do not match. 
Train data has '2' columns,and validation data has '1' columns.", ex.Message); + } + + [Fact] + public void ValidateExperimentExecuteArgsTrainValidColNamesMismatch() + { + var context = new MLContext(); + + var trainDataBuilder = new ArrayDataViewBuilder(context); + trainDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 }); + trainDataBuilder.AddColumn("1", new string[] { "1" }); + var trainData = trainDataBuilder.GetDataView(); + + var validDataBuilder = new ArrayDataViewBuilder(context); + validDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 }); + validDataBuilder.AddColumn("2", new string[] { "2" }); + var validData = validDataBuilder.GetDataView(); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(trainData, + new ColumnInformation() { LabelColumnName = "0" }, validData, TaskKind.Regression)); + Assert.StartsWith("Training data and validation data schemas do not match. Column '1' exsits in train data, but not in validation data.", ex.Message); + } + + [Fact] + public void ValidateExperimentExecuteArgsTrainValidColTypeMismatch() + { + var context = new MLContext(); + + var trainDataBuilder = new ArrayDataViewBuilder(context); + trainDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 }); + trainDataBuilder.AddColumn("1", new string[] { "1" }); + var trainData = trainDataBuilder.GetDataView(); + + var validDataBuilder = new ArrayDataViewBuilder(context); + validDataBuilder.AddColumn("0", NumberDataViewType.Single, new float[] { 1 }); + validDataBuilder.AddColumn("1", NumberDataViewType.Single, new float[] { 1 }); + var validData = validDataBuilder.GetDataView(); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(trainData, + new ColumnInformation() { LabelColumnName = "0" }, validData, TaskKind.Regression)); + Assert.StartsWith("Training data and validation data schemas do not match. 
Column '1' is of type String in train data, and type Single in validation data.", ex.Message); + } + + [Fact] + public void ValidateInferColumnsArgsNullPath() + { + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateInferColumnsArgs(null, "Label")); + Assert.StartsWith("Provided path cannot be null", ex.Message); + } + + [Fact] + public void ValidateInferColumnsArgsPathNotExist() + { + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateInferColumnsArgs("idontexist", "Label")); + Assert.StartsWith("File 'idontexist' does not exist", ex.Message); + } + + [Fact] + public void ValidateInferColumnsArgsEmptyFile() + { + const string emptyFilePath = "empty"; + File.Create(emptyFilePath).Dispose(); + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateInferColumnsArgs(emptyFilePath, "Label")); + Assert.StartsWith("File at path 'empty' cannot be empty", ex.Message); + } + + [Fact] + public void ValidateInferColsPath() + { + UserInputValidationUtil.ValidateInferColumnsArgs(DatasetUtil.DownloadUciAdultDataset()); + } + + [Fact] + public void ValidateFeaturesColInvalidType() + { + var schemaBuilder = new DataViewSchema.Builder(); + schemaBuilder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Double); + schemaBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single); + var schema = schemaBuilder.ToSchema(); + var dataView = DataViewTestFixture.BuildDummyDataView(schema); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(dataView, new ColumnInformation(), null, TaskKind.Regression)); + Assert.StartsWith("Features column must be of data type Single", ex.Message); + } + + [Fact] + public void ValidateTextColumnNotText() + { + const string TextPurposeColName = "TextColumn"; + var schemaBuilder = new DataViewSchema.Builder(); + schemaBuilder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Single); + schemaBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single); + 
schemaBuilder.AddColumn(TextPurposeColName, NumberDataViewType.Single); + var schema = schemaBuilder.ToSchema(); + var dataView = DataViewTestFixture.BuildDummyDataView(schema); + + var columnInfo = new ColumnInformation(); + columnInfo.TextColumnNames.Add(TextPurposeColName); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(dataView, columnInfo, null, TaskKind.Regression)); + Assert.Equal("Provided text column 'TextColumn' was of type Single, but only type String is allowed.", ex.Message); + } + + [Fact] + public void ValidateRegressionLabelTypes() + { + ValidateLabelTypeTestCore(TaskKind.Regression, NumberDataViewType.Single, true); + ValidateLabelTypeTestCore(TaskKind.Regression, BooleanDataViewType.Instance, false); + ValidateLabelTypeTestCore(TaskKind.Regression, NumberDataViewType.Double, false); + ValidateLabelTypeTestCore(TaskKind.Regression, TextDataViewType.Instance, false); + } + + [Fact] + public void ValidateBinaryClassificationLabelTypes() + { + ValidateLabelTypeTestCore(TaskKind.BinaryClassification, NumberDataViewType.Single, false); + ValidateLabelTypeTestCore(TaskKind.BinaryClassification, BooleanDataViewType.Instance, true); + } + + [Fact] + public void ValidateMulticlassLabelTypes() + { + ValidateLabelTypeTestCore(TaskKind.MulticlassClassification, NumberDataViewType.Single, true); + ValidateLabelTypeTestCore(TaskKind.MulticlassClassification, BooleanDataViewType.Instance, true); + ValidateLabelTypeTestCore(TaskKind.MulticlassClassification, NumberDataViewType.Double, true); + ValidateLabelTypeTestCore(TaskKind.MulticlassClassification, TextDataViewType.Instance, true); + } + + [Fact] + public void ValidateAllowedFeatureColumnTypes() + { + var dataViewBuilder = new ArrayDataViewBuilder(new MLContext()); + dataViewBuilder.AddColumn("Boolean", BooleanDataViewType.Instance, false); + dataViewBuilder.AddColumn("Number", NumberDataViewType.Single, 0f); + dataViewBuilder.AddColumn("Text", "a"); + 
dataViewBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, 0f); + var dataView = dataViewBuilder.GetDataView(); + UserInputValidationUtil.ValidateExperimentExecuteArgs(dataView, new ColumnInformation(), + null, TaskKind.Regression); + } + + [Fact] + public void ValidateProhibitedFeatureColumnType() + { + var schemaBuilder = new DataViewSchema.Builder(); + schemaBuilder.AddColumn("UInt64", NumberDataViewType.UInt64); + schemaBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single); + var schema = schemaBuilder.ToSchema(); + var dataView = DataViewTestFixture.BuildDummyDataView(schema); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(dataView, new ColumnInformation(), + null, TaskKind.Regression)); + Assert.StartsWith("Only supported feature column types are Boolean, Single, and String. Please change the feature column UInt64 of type UInt64 to one of the supported types.", ex.Message); + } + + [Fact] + public void ValidateEmptyTrainingDataThrows() + { + var schemaBuilder = new DataViewSchema.Builder(); + schemaBuilder.AddColumn("Number", NumberDataViewType.Single); + schemaBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single); + var schema = schemaBuilder.ToSchema(); + var dataView = DataViewTestFixture.BuildDummyDataView(schema, createDummyRow: false); + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(dataView, new ColumnInformation(), + null, TaskKind.Regression)); + Assert.StartsWith("Training data has 0 rows", ex.Message); + } + + [Fact] + public void ValidateEmptyValidationDataThrows() + { + // Training data + var dataViewBuilder = new ArrayDataViewBuilder(new MLContext()); + dataViewBuilder.AddColumn("Number", NumberDataViewType.Single, 0f); + dataViewBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single, 0f); + var trainingData = dataViewBuilder.GetDataView(); + + // Validation data + var schemaBuilder = new 
DataViewSchema.Builder(); + schemaBuilder.AddColumn("Number", NumberDataViewType.Single); + schemaBuilder.AddColumn(DefaultColumnNames.Label, NumberDataViewType.Single); + var schema = schemaBuilder.ToSchema(); + var validationData = DataViewTestFixture.BuildDummyDataView(schema, createDummyRow: false); + + var ex = Assert.Throws(() => UserInputValidationUtil.ValidateExperimentExecuteArgs(trainingData, new ColumnInformation(), + validationData, TaskKind.Regression)); + Assert.StartsWith("Validation data has 0 rows", ex.Message); + } + + private static void ValidateLabelTypeTestCore(TaskKind task, PrimitiveDataViewType labelType, bool labelTypeShouldBeValid) + { + var dataViewBuilder = new ArrayDataViewBuilder(new MLContext()); + dataViewBuilder.AddColumn(DefaultColumnNames.Features, NumberDataViewType.Single, 0f); + if (labelType == TextDataViewType.Instance) + { + dataViewBuilder.AddColumn(DefaultColumnNames.Label, string.Empty); + } + else + { + dataViewBuilder.AddColumn(DefaultColumnNames.Label, labelType, Activator.CreateInstance()); + } + var dataView = dataViewBuilder.GetDataView(); + var validationExceptionThrown = false; + try + { + UserInputValidationUtil.ValidateExperimentExecuteArgs(dataView, new ColumnInformation(), null, task); + } + catch + { + validationExceptionThrown = true; + } + Assert.Equal(labelTypeShouldBeValid, !validationExceptionThrown); + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/Util.cs b/test/Microsoft.ML.AutoML.Tests/Util.cs new file mode 100644 index 0000000000..c23046814f --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/Util.cs @@ -0,0 +1,37 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using Xunit; +using Newtonsoft.Json; +using Newtonsoft.Json.Converters; + +namespace Microsoft.ML.AutoML.Test +{ + internal static class Util + { + public static void AssertObjectMatchesJson(string expectedJson, T obj) + { + var actualJson = JsonConvert.SerializeObject(obj, + Formatting.Indented, new JsonConverter[] { new StringEnumConverter() }); + Assert.Equal(expectedJson, actualJson); + } + + public static ValueGetter>> GetKeyValueGetter(IEnumerable colNames) + { + return (ref VBuffer> dst) => + { + var editor = VBufferEditor.Create(ref dst, colNames.Count()); + for (int i = 0; i < colNames.Count(); i++) + { + editor.Values[i] = colNames.ElementAt(i).AsMemory(); + } + dst = editor.Commit(); + }; + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/Utils/MLNetUtils/DataViewTestFixture.cs b/test/Microsoft.ML.AutoML.Tests/Utils/MLNetUtils/DataViewTestFixture.cs new file mode 100644 index 0000000000..fe09167b3b --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/Utils/MLNetUtils/DataViewTestFixture.cs @@ -0,0 +1,62 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.AutoML.Test +{ + static class DataViewTestFixture + { + public static IDataView BuildDummyDataView(IEnumerable columns, bool createDummyRow = true) + { + return BuildDummyDataView(columns.Select(c => (c.Name, c.Type)), createDummyRow); + } + + public static IDataView BuildDummyDataView(DataViewSchema schema, bool createDummyRow = true) + { + return BuildDummyDataView(schema.Select(c => (c.Name, c.Type)), createDummyRow); + } + + public static IDataView BuildDummyDataView(IEnumerable<(string name, DataViewType type)> columns, bool createDummyRow = true) + { + var dataBuilder = new ArrayDataViewBuilder(new MLContext()); + + foreach (var column in columns) + { + if (column.type == NumberDataViewType.Single) + { + dataBuilder.AddColumn(column.name, NumberDataViewType.Single, createDummyRow ? new float[] { 0 } : new float[] { }); + } + if (column.type == NumberDataViewType.Double) + { + dataBuilder.AddColumn(column.name, NumberDataViewType.Double, createDummyRow ? new double[] { 0 } : new double[] { }); + } + if (column.type == NumberDataViewType.UInt64) + { + dataBuilder.AddColumn(column.name, NumberDataViewType.UInt64, createDummyRow ? new System.UInt64[] { 0 } : new System.UInt64[] { }); + } + else if (column.type == BooleanDataViewType.Instance) + { + dataBuilder.AddColumn(column.name, BooleanDataViewType.Instance, createDummyRow ? new bool[] { false } : new bool[] { }); + } + else if (column.type == TextDataViewType.Instance) + { + dataBuilder.AddColumn(column.name, createDummyRow ? new string[] { "a" } : new string[] { }); + } + else if (column.type.IsVector() && column.type.GetItemType() == NumberDataViewType.Single) + { + dataBuilder.AddColumn( + column.name, + Util.GetKeyValueGetter(createDummyRow ? new string[] { "1", "2" } : new string[] { }), + NumberDataViewType.Single, + createDummyRow ? 
new float[] { 0, 0 } : new float[] { }); + } + } + + return dataBuilder.GetDataView(); + } + } +} \ No newline at end of file diff --git a/test/Microsoft.ML.AutoML.Tests/Utils/MLNetUtils/MLNetUtils.cs b/test/Microsoft.ML.AutoML.Tests/Utils/MLNetUtils/MLNetUtils.cs new file mode 100644 index 0000000000..4997d66a26 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/Utils/MLNetUtils/MLNetUtils.cs @@ -0,0 +1,23 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; + +namespace Microsoft.ML.AutoML.Test +{ + internal static class MLNetUtils + { + public static bool[] BuildArray(int length, IEnumerable columnsNeeded) + { + var result = new bool[length]; + foreach (var col in columnsNeeded) + { + if (col.Index < result.Length) + result[col.Index] = true; + } + + return result; + } + } +} diff --git a/test/Microsoft.ML.AutoML.Tests/Utils/TaskAgnosticAutoFit.cs b/test/Microsoft.ML.AutoML.Tests/Utils/TaskAgnosticAutoFit.cs new file mode 100644 index 0000000000..afaf3e5b13 --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/Utils/TaskAgnosticAutoFit.cs @@ -0,0 +1,144 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.ML.Data; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Microsoft.ML.AutoML.Test +{ + public enum TaskType + { + Classification = 1, + Regression + } + + /// + /// make AutoFit and Score calls uniform across task types + /// + internal class TaskAgnosticAutoFit + { + private TaskType taskType; + private MLContext context; + + internal interface IUniversalProgressHandler : IProgress>, IProgress> + { + } + + internal TaskAgnosticAutoFit(TaskType taskType, MLContext context) + { + this.taskType = taskType; + this.context = context; + } + + internal IEnumerable AutoFit( + IDataView trainData, + string label, + int maxModels, + uint maxExperimentTimeInSeconds, + IDataView validationData = null, + IEstimator preFeaturizers = null, + IEnumerable<(string, ColumnPurpose)> columnPurposes = null, + IUniversalProgressHandler progressHandler = null) + { + var columnInformation = new ColumnInformation() { LabelColumnName = label }; + + switch (this.taskType) + { + case TaskType.Classification: + + var mcs = new MulticlassExperimentSettings + { + OptimizingMetric = MulticlassClassificationMetric.MicroAccuracy, + + MaxExperimentTimeInSeconds = maxExperimentTimeInSeconds, + MaxModels = maxModels + }; + + var classificationResult = this.context.Auto() + .CreateMulticlassClassificationExperiment(mcs) + .Execute( + trainData, + validationData, + columnInformation, + progressHandler: progressHandler); + + var iterationResults = classificationResult.RunDetails.Select(i => new TaskAgnosticIterationResult(i)).ToList(); + + return iterationResults; + + case TaskType.Regression: + + var rs = new RegressionExperimentSettings + { + OptimizingMetric = RegressionMetric.RSquared, + + MaxExperimentTimeInSeconds = maxExperimentTimeInSeconds, + MaxModels = maxModels + }; + + var regressionResult = this.context.Auto() + .CreateRegressionExperiment(rs) + .Execute( + trainData, + validationData, + columnInformation, + progressHandler: 
progressHandler); + + iterationResults = regressionResult.RunDetails.Select(i => new TaskAgnosticIterationResult(i)).ToList(); + + return iterationResults; + + default: + throw new ArgumentException($"Unknown task type {this.taskType}.", "TaskType"); + } + } + + internal struct ScoreResult + { + public IDataView ScoredTestData; + public double PrimaryMetricResult; + public Dictionary Metrics; + } + + internal ScoreResult Score( + IDataView testData, + ITransformer model, + string label) + { + var result = new ScoreResult(); + + result.ScoredTestData = model.Transform(testData); + + switch (this.taskType) + { + case TaskType.Classification: + + var classificationMetrics = context.MulticlassClassification.Evaluate(result.ScoredTestData, labelColumnName: label); + + //var classificationMetrics = context.MulticlassClassification.(scoredTestData, labelColumnName: label); + result.PrimaryMetricResult = classificationMetrics.MicroAccuracy; // TODO: don't hardcode metric + result.Metrics = TaskAgnosticIterationResult.MetricValuesToDictionary(classificationMetrics); + + break; + + case TaskType.Regression: + + var regressionMetrics = context.Regression.Evaluate(result.ScoredTestData, labelColumnName: label); + + result.PrimaryMetricResult = regressionMetrics.RSquared; // TODO: don't hardcode metric + result.Metrics = TaskAgnosticIterationResult.MetricValuesToDictionary(regressionMetrics); + + break; + + default: + throw new ArgumentException($"Unknown task type {this.taskType}.", "TaskType"); + } + + return result; + } + } +} + diff --git a/test/Microsoft.ML.AutoML.Tests/Utils/TaskAgnosticIterationResult.cs b/test/Microsoft.ML.AutoML.Tests/Utils/TaskAgnosticIterationResult.cs new file mode 100644 index 0000000000..462872dc9f --- /dev/null +++ b/test/Microsoft.ML.AutoML.Tests/Utils/TaskAgnosticIterationResult.cs @@ -0,0 +1,87 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Data; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Microsoft.ML.AutoML.Test +{ + internal class TaskAgnosticIterationResult + { + internal double PrimaryMetricValue; + + internal Dictionary MetricValues = new Dictionary(); + + internal readonly ITransformer Model; + internal readonly Exception Exception; + internal string TrainerName; + internal double RuntimeInSeconds; + internal IEstimator Estimator; + internal Pipeline Pipeline; + internal int PipelineInferenceTimeInSeconds; + + private string primaryMetricName; + + private TaskAgnosticIterationResult(RunDetail baseRunDetail, object validationMetrics, string primaryMetricName) + { + this.TrainerName = baseRunDetail.TrainerName; + this.Estimator = baseRunDetail.Estimator; + this.Pipeline = baseRunDetail.Pipeline; + + this.PipelineInferenceTimeInSeconds = (int)baseRunDetail.PipelineInferenceTimeInSeconds; + this.RuntimeInSeconds = (int)baseRunDetail.RuntimeInSeconds; + + this.primaryMetricName = primaryMetricName; + this.PrimaryMetricValue = -1; // default value in case of exception. TODO: won't work for minimizing metrics, use nullable? 
+ + if (validationMetrics == null) + { + return; + } + + this.MetricValues = MetricValuesToDictionary(validationMetrics); + + this.PrimaryMetricValue = this.MetricValues[this.primaryMetricName]; + } + + public TaskAgnosticIterationResult(RunDetail runDetail, string primaryMetricName = "RSquared") + : this(runDetail, runDetail.ValidationMetrics, primaryMetricName) + { + if (runDetail.Exception == null) + { + this.Model = runDetail.Model; + } + + this.Exception = runDetail.Exception; + } + + public TaskAgnosticIterationResult(RunDetail runDetail, string primaryMetricName = "MicroAccuracy") + : this(runDetail, runDetail.ValidationMetrics, primaryMetricName) + { + if (runDetail.Exception == null) + { + this.Model = runDetail.Model; + } + + this.Exception = runDetail.Exception; + } + + public static Dictionary MetricValuesToDictionary(T metric) + { + var supportedTypes = new[] { typeof(MulticlassClassificationMetrics), typeof(RegressionMetrics) }; + + if (!supportedTypes.Contains(metric.GetType())) + { + throw new ArgumentException($"Unsupported metric type {typeof(T).Name}."); + } + + var propertiesToReport = metric.GetType().GetProperties().Where(p => p.PropertyType == typeof(double)); + + return propertiesToReport.ToDictionary(p => p.Name, p => (double)metric.GetType().GetProperty(p.Name).GetValue(metric)); + } + } +} + diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index d5bf30ab96..e9727d13a7 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -3760,21 +3760,6 @@ public void EntryPointWordEmbeddings() } } - [TensorFlowFact] - public void EntryPointTensorFlowTransform() - { - Env.ComponentCatalog.RegisterAssembly(typeof(TensorFlowTransformer).Assembly); - - TestEntryPointPipelineRoutine(GetDataPath("Train-Tiny-28x28.txt"), "col=Label:R4:0 col=Placeholder:R4:1-784", - new[] { 
"Transforms.TensorFlowScorer" }, - new[] - { - @"'InputColumns': [ 'Placeholder' ], - 'ModelLocation': 'mnist_model/frozen_saved_model.pb', - 'OutputColumns': [ 'Softmax' ]" - }); - } - [Fact(Skip = "Needs real time series dataset. https://github.com/dotnet/machinelearning/issues/1120")] public void EntryPointSsaChangePoint() { @@ -5637,6 +5622,21 @@ public void TestOvaMacroWithUncalibratedLearner() } } + [TensorFlowFact] + public void EntryPointTensorFlowTransform() + { + Env.ComponentCatalog.RegisterAssembly(typeof(TensorFlowTransformer).Assembly); + + TestEntryPointPipelineRoutine(GetDataPath("Train-Tiny-28x28.txt"), "col=Label:R4:0 col=Placeholder:R4:1-784", + new[] { "Transforms.TensorFlowScorer" }, + new[] + { + @"'InputColumns': [ 'Placeholder' ], + 'ModelLocation': 'mnist_model/frozen_saved_model.pb', + 'OutputColumns': [ 'Softmax' ]" + }); + } + [TensorFlowFact] public void TestTensorFlowEntryPoint() { diff --git a/test/Microsoft.ML.Predictor.Tests/Test-API.cs b/test/Microsoft.ML.Predictor.Tests/Test-API.cs index 2a7c61d215..f51444d27d 100644 --- a/test/Microsoft.ML.Predictor.Tests/Test-API.cs +++ b/test/Microsoft.ML.Predictor.Tests/Test-API.cs @@ -81,13 +81,13 @@ public void MulticlassExampleTest() // BulkPredict, so this wasn't using FastRank's BulkPredict. Float[][] bulkPredictions = ((IBulkPredictor)pred).BulkPredict(instances); - Assert.AreEqual(predictions.Length, bulkPredictions.Length); + Assert.Equal(predictions.Length, bulkPredictions.Length); for (int i = 0; i < predictions.Length; i++) { - Assert.AreEqual(predictions[i].Length, bulkPredictions[i].Length); + Assert.Equal(predictions[i].Length, bulkPredictions[i].Length); for (int j = 0; j < predictions[i].Length; j++) { - Assert.AreEqual(predictions[i][j], bulkPredictions[i][j]); + Assert.Equal(predictions[i][j], bulkPredictions[i][j]); } } @@ -103,11 +103,11 @@ public void MulticlassExampleTest() // sanity check vs. 
original predictor var results2 = new MulticlassTester(new MulticlassTesterArguments()).Test(predictor, instances); - Assert.AreEqual(results.Length, results2.Length); + Assert.Equal(results.Length, results2.Length); for (int i = 0; i < results.Length; i++) { - Assert.AreEqual(results[i].Name, results2[i].Name); - Assert.AreEqual(results[i].Value, results2[i].Value); + Assert.Equal(results[i].Name, results2[i].Name); + Assert.Equal(results[i].Value, results2[i].Value); } } File.Delete(modelFilename); @@ -185,12 +185,12 @@ public void SimpleExampleTest() Float[] bulkPredictions = ((IBulkPredictor)pred).BulkPredict(instances); - Assert.AreEqual(rawPredictions.Length, bulkPredictions.Length); - Assert.AreEqual(rawPredictions.Length, rawPredictions1.Length); + Assert.Equal(rawPredictions.Length, bulkPredictions.Length); + Assert.Equal(rawPredictions.Length, rawPredictions1.Length); for (int i = 0; i < rawPredictions.Length; i++) - Assert.AreEqual(rawPredictions[i], bulkPredictions[i]); + Assert.Equal(rawPredictions[i], bulkPredictions[i]); for (int i = 0; i < rawPredictions.Length; i++) - Assert.AreEqual(rawPredictions[i], rawPredictions1[i]); + Assert.Equal(rawPredictions[i], rawPredictions1[i]); //test new testers { @@ -204,11 +204,11 @@ public void SimpleExampleTest() // sanity check vs. 
original predictor var results2 = new ClassifierTester(new ProbabilityPredictorTesterArguments()).Test(predictor, instances); - Assert.AreEqual(results.Length, results2.Length); + Assert.Equal(results.Length, results2.Length); for (int i = 0; i < results.Length; i++) { - Assert.AreEqual(results[i].Name, results2[i].Name); - Assert.AreEqual(results[i].Value, results2[i].Value); + Assert.Equal(results[i].Name, results2[i].Name); + Assert.Equal(results[i].Value, results2[i].Value); } } File.Delete(modelFilename); @@ -231,7 +231,7 @@ public void FactoryExampleTest() ///********* Training a model *******// string modelFilename = Path.GetTempFileName(); TLCArguments cmd = new TLCArguments(); - Assert.IsTrue(CmdParser.ParseArguments(dataset.extraSettings, cmd)); + Assert.True(CmdParser.ParseArguments(dataset.extraSettings, cmd)); cmd.command = Command.Train; cmd.modelfile = modelFilename; cmd.datafile = dataFilename; @@ -263,7 +263,7 @@ public void FactoryExampleTest() continue; string[] cols = text.Split(','); - Assert.IsTrue(cols.Length == 15); + Assert.True(cols.Length == 15); if (headerSkip) { @@ -316,8 +316,8 @@ public void FactoryExampleTest() originalProbabilities.Add(probability); } - CollectionAssert.AreEqual(outputs, originalOutputs); - CollectionAssert.AreEqual(probabilities, originalProbabilities); + CollectionAssert.Equal(outputs, originalOutputs); + CollectionAssert.Equal(probabilities, originalProbabilities); File.Delete(modelFilename); @@ -401,10 +401,10 @@ private void WeightedMetricTest(Instances noWeights, Instances weights1, Instanc if (results[i] == null) continue; //The nonweighted result should have half of the metrics - Assert.AreEqual(results[i].Length, results[0].Length * 2); + Assert.Equal(results[i].Length, results[0].Length * 2); for (int m = 0; m < results[0].Length; m++) { - Assert.AreEqual(results[0][m].Name, results[i][m].Name); + Assert.Equal(results[0][m].Name, results[i][m].Name); Double diff = Math.Abs(results[0][m].Value - 
results[i][m].Value); if (diff > 1e-6) { @@ -416,8 +416,8 @@ private void WeightedMetricTest(Instances noWeights, Instances weights1, Instanc //Compare all metrics between weight 1 (with and without explicit weight in the input) for (int m = 0; m < results[0].Length; m++) { - Assert.IsTrue(Math.Abs(results[0][m].Value - results[1][m].Value) < 1e-10); - Assert.IsTrue(Math.Abs(results[0][m].Value - results[1][m + results[0].Length].Value) < 1e-10); + Assert.True(Math.Abs(results[0][m].Value - results[1][m].Value) < 1e-10); + Assert.True(Math.Abs(results[0][m].Value - results[1][m + results[0].Length].Value) < 1e-10); } } } diff --git a/test/Microsoft.ML.Predictor.Tests/TestCreateInstances.cs b/test/Microsoft.ML.Predictor.Tests/TestCreateInstances.cs index 1116e337fb..fed580a70e 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestCreateInstances.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestCreateInstances.cs @@ -42,7 +42,7 @@ public void TestCreateTextInstances() outFile, statsFile, outTestFile, testStatsFile, outValidFile, validStatsFile); argsStr += " /writer TextInstanceWriter{/stats=+} /disableTracking=+"; var args = new TLCArguments(); - Assert.IsTrue(CmdParser.ParseArguments(argsStr, args)); + Assert.True(CmdParser.ParseArguments(argsStr, args)); RunExperiments.Run(args); @@ -75,7 +75,7 @@ public void TestCreateTextInstancesConstant() trainData, breast.extraSettings, outFile, statsFile); argsStr += " writer=TextInstanceWriter{stats+} disableTracking+"; var args = new TLCArguments(); - Assert.IsTrue(CmdParser.ParseArguments(argsStr, args)); + Assert.True(CmdParser.ParseArguments(argsStr, args)); RunExperiments.Run(args); @@ -103,7 +103,7 @@ public void TestCreateTextInstancesWithNormalization() "/c=CreateInstances {0} /test={1} /norm=MinMaxNormalizer /{2} /cifile={3} /citestfile={4}", trainData, testData, transArgs, outFile1, outTestFile1); var args1 = new TLCArguments(); - Assert.IsTrue(CmdParser.ParseArguments(argsStr1, args1)); + 
Assert.True(CmdParser.ParseArguments(argsStr1, args1)); RunExperiments.Run(args1); @@ -111,7 +111,7 @@ public void TestCreateTextInstancesWithNormalization() "/c=CreateInstances {0} /test={1} /inst Trans{{trans=MinMaxNormalizer {2}}} /cifile={3} /citestfile={4}", trainData, testData, transArgs, outFile2, outTestFile2); var args2 = new TLCArguments(); - Assert.IsTrue(CmdParser.ParseArguments(argsStr2, args2)); + Assert.True(CmdParser.ParseArguments(argsStr2, args2)); RunExperiments.Run(args2); @@ -128,7 +128,7 @@ public void TestCreateTextInstancesWithNormalization() private void CompareInstances(TlcTextInstances instances1, TlcTextInstances instances2) { - Assert.IsTrue(instances1.Schema.NumFeatures == instances2.Schema.NumFeatures, "mismatch on schema features"); + Assert.True(instances1.Schema.NumFeatures == instances2.Schema.NumFeatures, "mismatch on schema features"); using (var e1 = instances1.GetEnumerator()) using (var e2 = instances2.GetEnumerator()) @@ -137,16 +137,16 @@ private void CompareInstances(TlcTextInstances instances1, TlcTextInstances inst { bool b1 = e1.MoveNext(); bool b2 = e2.MoveNext(); - Assert.IsTrue(b1 == b2, "different number of instances"); + Assert.True(b1 == b2, "different number of instances"); if (!b1) break; var inst1 = e1.Current; var inst2 = e2.Current; - Assert.IsTrue(inst1.Label == inst2.Label, "mismatch on instance label"); - Assert.IsTrue(inst1.NumFeatures == inst2.NumFeatures, "mismatch on number of features"); - Assert.IsTrue(inst1.NumFeatures == instances1.Schema.NumFeatures, "mismatch on number of instance vs. 
schema features"); - Assert.IsTrue(Utils.AreEqual(inst1.Features.Values, inst2.Features.Values), "mismatch on feature values"); - Assert.IsTrue(Utils.AreEqual(inst1.Features.Indices, inst2.Features.Indices), "mismatch on feature indices"); + Assert.True(inst1.Label == inst2.Label, "mismatch on instance label"); + Assert.True(inst1.NumFeatures == inst2.NumFeatures, "mismatch on number of features"); + Assert.True(inst1.NumFeatures == instances1.Schema.NumFeatures, "mismatch on number of instance vs. schema features"); + Assert.True(Utils.AreEqual(inst1.Features.Values, inst2.Features.Values), "mismatch on feature values"); + Assert.True(Utils.AreEqual(inst1.Features.Indices, inst2.Features.Indices), "mismatch on feature indices"); } } } @@ -156,7 +156,7 @@ public void TestPcaTransform() { // Force Microsoft.ML.PCA assembly to be loaded into the AppDomain so // ReflectionUtils.FindClassCore does not return null when called by ReflectionUtils.CreateInstance - Assert.AreEqual(typeof(PCAPredictor).Name, "PCAPredictor"); + Assert.Equal(typeof(PCAPredictor).Name, "PCAPredictor"); string trainData = GetDataPath(TestDatasets.mnistTiny28.trainFilename); string fileName = TestContext.TestName + "-Train.txt"; @@ -170,7 +170,7 @@ public void TestPcaTransform() pcaTransformArgs, outFile); var args1 = new TLCArguments(); - Assert.IsTrue(CmdParser.ParseArguments(argsStr1, args1)); + Assert.True(CmdParser.ParseArguments(argsStr1, args1)); RunExperiments.Run(args1); CheckEquality(Dir, fileName); @@ -180,14 +180,14 @@ public void TestPcaTransform() Double[] l1norms = new Double[rank]; foreach (Instance instance in outputInstances) { - Assert.IsTrue(instance.Features.Count == rank); + Assert.True(instance.Features.Count == rank); for (int i = 0; i < instance.Features.Values.Length; i++) l1norms[i] += (instance.Features.Values[i] < 0 ? 
-instance.Features.Values[i] : instance.Features.Values[i]); } for (int i = 0; i < l1norms.Length - 1; i++) { - Assert.IsTrue(l1norms[i] > l1norms[i + 1]); + Assert.True(l1norms[i] > l1norms[i + 1]); } Done(); @@ -205,7 +205,7 @@ public void TestFeatureHandlerIncorrectMapping() dataModelFile, ciFile); var args = new TLCArguments(); - Assert.IsTrue(CmdParser.ParseArguments(argsString, args)); + Assert.True(CmdParser.ParseArguments(argsString, args)); RunExperiments.Run(args); string ciFailFile = DeleteOutputPath(Dir, TestContext.TestName + "-ci-fail.tsv"); @@ -215,7 +215,7 @@ public void TestFeatureHandlerIncorrectMapping() dataModelFile, ciFailFile); args = new TLCArguments(); - Assert.IsTrue(CmdParser.ParseArguments(argsString, args)); + Assert.True(CmdParser.ParseArguments(argsString, args)); try { RunExperiments.Run(args); @@ -223,7 +223,7 @@ public void TestFeatureHandlerIncorrectMapping() } catch (Exception ex) { - Assert.IsTrue(ex.GetBaseException() is InvalidOperationException); + Assert.True(ex.GetBaseException() is InvalidOperationException); } Done(); diff --git a/test/Microsoft.ML.Predictor.Tests/TestCrossValidation.cs b/test/Microsoft.ML.Predictor.Tests/TestCrossValidation.cs index 0b7540bbd5..300ae3d235 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestCrossValidation.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestCrossValidation.cs @@ -25,19 +25,19 @@ public void TestRandomBalancedFoldCreation() int[] folds = foldCreator.CreateFoldIndicesStratified(li, cmd, new Random(1)); int[] expectedIndices = { 1, 0, 3, 4, 2 }; for (int i = 0; i < folds.Length; i++) - Assert.AreEqual(folds[i], expectedIndices[i]); + Assert.Equal(folds[i], expectedIndices[i]); li = CreateInstancesWithNKeys(7); folds = foldCreator.CreateFoldIndicesStratified(li, cmd, new Random(1)); expectedIndices = new int[] { 1, 0, 4, 1, 0, 2, 3 }; for (int i = 0; i < folds.Length; i++) - Assert.AreEqual(folds[i], expectedIndices[i]); + Assert.Equal(folds[i], expectedIndices[i]); li = 
CreateInstancesWithNKeys(10); folds = foldCreator.CreateFoldIndicesStratified(li, cmd, new Random(1)); expectedIndices = new int[] { 2, 1, 0, 3, 2, 4, 0, 4, 3, 1 }; for (int i = 0; i < folds.Length; i++) - Assert.AreEqual(folds[i], expectedIndices[i]); + Assert.Equal(folds[i], expectedIndices[i]); Done(); } diff --git a/test/Microsoft.ML.Predictor.Tests/TestIniModels.cs b/test/Microsoft.ML.Predictor.Tests/TestIniModels.cs index 8ef374d434..00158a6234 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestIniModels.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestIniModels.cs @@ -315,7 +315,7 @@ public void RunAllIniFileEvaluationTests( ); } } - Assert.IsTrue(failureTestInformation.Count <= 0); + Assert.True(failureTestInformation.Count <= 0); } /// @@ -351,7 +351,7 @@ public void RunIniFileEvaluationTest( string modelFilePath = GetOutputPath(runParameters.BaselineDir, runParameters.ModelFilename); string trainDatasetPath = GetDataPath(trainDataset); string evaluationOutputDir = GetOutputDir(evaluationOutputDirPrefix + @"\Dirs\" + outName); - Assert.IsNull(EnsureEmptyDirectory(evaluationOutputDir)); + Assert.Null(EnsureEmptyDirectory(evaluationOutputDir)); string cmd = string.Format(EvaluationCommandLineFormat, modelFilePath, evaluationOutputDir, trainDatasetPath); string dir = Path.GetFullPath(EvaluationExecutorDir); diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs index 1657c34c96..6daceea70e 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs @@ -1237,14 +1237,14 @@ public void RegressorSyntheticOlsTest() var pred = trainer.CreatePredictor(); pred = WriteReloadOlsPredictor(pred); - Assert.AreEqual(featureCount, pred.InputType.VectorSize, "Unexpected input size"); - Assert.IsFalse(pred.HasStatistics, "Should not have statistics with exact specified model"); - Assert.AreEqual(null, pred.PValues, "Should not have p-values with 
no-stats model"); - Assert.AreEqual(null, pred.TValues, "Should not have t-values with no-stats model"); - Assert.AreEqual(null, pred.StandardErrors, "Should not have standard errors with no-stats model"); - Assert.IsTrue(Double.IsNaN(pred.RSquaredAdjusted), "R-squared adjusted should be NaN with no-stats model"); + Assert.Equal(featureCount, pred.InputType.VectorSize, "Unexpected input size"); + Assert.False(pred.HasStatistics, "Should not have statistics with exact specified model"); + Assert.Null(pred.PValues, "Should not have p-values with no-stats model"); + Assert.Null(pred.TValues, "Should not have t-values with no-stats model"); + Assert.Null(pred.StandardErrors, "Should not have standard errors with no-stats model"); + Assert.True(Double.IsNaN(pred.RSquaredAdjusted), "R-squared adjusted should be NaN with no-stats model"); foreach (Instance inst in subinstances) - Assert.AreEqual(inst.Label, pred.Predict(inst), tol, "Mismatch on example id {0}", inst.Id); + Assert.Equal(inst.Label, pred.Predict(inst), tol, "Mismatch on example id {0}", inst.Id); } float finalNorm; @@ -1255,16 +1255,16 @@ public void RegressorSyntheticOlsTest() trainer.Train(instances); var pred = trainer.CreatePredictor(); pred = WriteReloadOlsPredictor(pred); - Assert.AreEqual(featureCount, pred.InputType.VectorSize, "Unexpected input size"); - Assert.IsTrue(pred.HasStatistics, "Should have statistics"); - Assert.AreEqual(1.0, pred.RSquared, 1e-6, "Coefficient of determination should be 1 for exact specified model"); - Assert.IsTrue(FloatUtils.IsFinite(pred.RSquaredAdjusted), "R-squared adjusted should be finite with exact specified model"); - Assert.AreEqual(featureCount, pred.Weights.Count, "Wrong number of weights"); - Assert.AreEqual(featureCount + 1, pred.PValues.Count, "Wrong number of pvalues"); - Assert.AreEqual(featureCount + 1, pred.TValues.Count, "Wrong number of t-values"); - Assert.AreEqual(featureCount + 1, pred.StandardErrors.Count, "Wrong number of standard errors"); + 
Assert.Equal(featureCount, pred.InputType.VectorSize, "Unexpected input size"); + Assert.True(pred.HasStatistics, "Should have statistics"); + Assert.Equal(1.0, pred.RSquared, 1e-6, "Coefficient of determination should be 1 for exact specified model"); + Assert.True(FloatUtils.IsFinite(pred.RSquaredAdjusted), "R-squared adjusted should be finite with exact specified model"); + Assert.Equal(featureCount, pred.Weights.Count, "Wrong number of weights"); + Assert.Equal(featureCount + 1, pred.PValues.Count, "Wrong number of pvalues"); + Assert.Equal(featureCount + 1, pred.TValues.Count, "Wrong number of t-values"); + Assert.Equal(featureCount + 1, pred.StandardErrors.Count, "Wrong number of standard errors"); foreach (Instance inst in instances) - Assert.AreEqual(inst.Label, pred.Predict(inst), tol, "Mismatch on example id {0}", inst.Id); + Assert.Equal(inst.Label, pred.Predict(inst), tol, "Mismatch on example id {0}", inst.Id); finalNorm = pred.Weights.Sum(x => x * x); // Suppress statistics and retrain. 
@@ -1275,17 +1275,17 @@ public void RegressorSyntheticOlsTest() var pred2 = trainer2.CreatePredictor(); pred2 = WriteReloadOlsPredictor(pred2); - Assert.AreEqual(null, pred2.PValues, "P-values present but should be absent"); - Assert.AreEqual(null, pred2.TValues, "T-values present but should be absent"); - Assert.AreEqual(null, pred2.StandardErrors, "Standard errors present but should be absent"); - Assert.AreEqual(pred.RSquared, pred2.RSquared); - Assert.AreEqual(pred.RSquaredAdjusted, pred2.RSquaredAdjusted); - Assert.AreEqual(pred.Bias, pred2.Bias); + Assert.Null(pred2.PValues, "P-values present but should be absent"); + Assert.Null(pred2.TValues, "T-values present but should be absent"); + Assert.Null(pred2.StandardErrors, "Standard errors present but should be absent"); + Assert.Equal(pred.RSquared, pred2.RSquared); + Assert.Equal(pred.RSquaredAdjusted, pred2.RSquaredAdjusted); + Assert.Equal(pred.Bias, pred2.Bias); var w1 = pred.Weights.ToArray(); var w2 = pred2.Weights.ToArray(); - Assert.AreEqual(w1.Length, w2.Length); + Assert.Equal(w1.Length, w2.Length); for (int i = 0; i < w1.Length; ++i) - Assert.AreEqual(w1[i], w2[i]); + Assert.Equal(w1[i], w2[i]); } float[] regularizationParams = new float[] { 0, (float)0.01, (float)0.1 }; @@ -1328,7 +1328,7 @@ public void RegressorSyntheticOlsTest() { caught = true; } - Assert.IsTrue(caught, "Failed to encounter an error, when running OLS on a deficient system"); + Assert.True(caught, "Failed to encounter an error, when running OLS on a deficient system"); continue; } else @@ -1337,8 +1337,8 @@ public void RegressorSyntheticOlsTest() } var pred = trainer.CreatePredictor(); pred = WriteReloadOlsPredictor(pred); - Assert.AreEqual(featureCount, pred.InputType.VectorSize, "Unexpected input size"); - Assert.IsTrue(0 <= pred.RSquared && pred.RSquared < 1, "R-squared not in expected range"); + Assert.Equal(featureCount, pred.InputType.VectorSize, "Unexpected input size"); + Assert.True(0 <= pred.RSquared && pred.RSquared < 
1, "R-squared not in expected range"); Func, float> getError = p => noisyInstances.Select(inst => inst.Label - p(inst)).Sum(e => e * e); @@ -1350,7 +1350,7 @@ public void RegressorSyntheticOlsTest() float referenceCost = referenceError + regParam2 * referenceNorm; float smoothing = (float)(referenceCost * 5e-6); Log("Reference cost is {0} + {1} * {2} = {3}, upper bound was {4}", referenceError, regParam2, referenceNorm, referenceCost, boundCost); - Assert.IsTrue(boundCost > referenceCost, "Reference cost {0} was above theoretical upper bound {1}", referenceCost, boundCost); + Assert.True(boundCost > referenceCost, "Reference cost {0} was above theoretical upper bound {1}", referenceCost, boundCost); float lastCost = 0; var weights = pred.Weights.Sum(x => x * x); for (int trial = 0; trial < model.Length * 2; ++trial) @@ -1358,7 +1358,7 @@ public void RegressorSyntheticOlsTest() int param = trial / 2; bool up = (trial & 1) == 1; float[] w = pred.Weights.ToArray(); - Assert.AreEqual(featureCount, w.Length); + Assert.Equal(featureCount, w.Length); float b = pred.Bias; bool isBias = param == featureCount; float normDelta; @@ -1381,7 +1381,7 @@ public void RegressorSyntheticOlsTest() string desc = string.Format("after wiggling {0} {1} from {2} to {3}", isBias ? "bias" : string.Format("weight[{0}]", param), up ? "up" : "down", origValue, newValue); Log("Finite difference cost is {0} ({1}), {2}", wiggledCost, wiggledCost - referenceCost, desc); - Assert.IsTrue(wiggledCost > referenceCost * (float)(1 - 5e-7), "Finite difference cost {0} not higher than reference cost {1}, {2}", + Assert.True(wiggledCost > referenceCost * (float)(1 - 5e-7), "Finite difference cost {0} not higher than reference cost {1}, {2}", wiggledCost, referenceCost, desc); if (up) { @@ -1389,7 +1389,7 @@ public void RegressorSyntheticOlsTest() // equal amounts up and down should lead to *roughly* the same error. 
float ratio = 1 - (lastCost - referenceCost + smoothing) / (wiggledCost - referenceCost + smoothing); Log("Wiggled up had a relative difference of {0:0.0%} vs. wiggled down", ratio); - Assert.IsTrue(0.1 > Math.Abs(ratio), "Ratio {0} of up/down too high, {1}", ratio, desc); + Assert.True(0.1 > Math.Abs(ratio), "Ratio {0} of up/down too high, {1}", ratio, desc); } lastCost = wiggledCost; } @@ -1477,13 +1477,13 @@ public void RegressorSyntheticDuplicatedOlsTest() var pred2 = trainer2.CreatePredictor(); var tol = 1e-5; - Assert.AreEqual(pred.RSquared, pred2.RSquared, tol); - Assert.AreEqual(pred.Bias, pred2.Bias, tol); + Assert.Equal(pred.RSquared, pred2.RSquared, tol); + Assert.Equal(pred.Bias, pred2.Bias, tol); var w1 = pred.Weights.ToArray(); var w2 = pred2.Weights.ToArray(); - Assert.AreEqual(w1.Length, w2.Length); + Assert.Equal(w1.Length, w2.Length); for (int i = 0; i < w1.Length; ++i) - Assert.AreEqual(w1[i], w2[i], tol); + Assert.Equal(w1[i], w2[i], tol); Done(); } @@ -1877,7 +1877,7 @@ private void CompareSvmToLibSvmCore(string kernelType, string kernel, IHostEnvir predictions1.Add(res1); predictions2.Add(res2); - Assert.IsTrue(AreEqual(res1, res2, MaxRelError, Epsilon), + Assert.True(AreEqual(res1, res2, MaxRelError, Epsilon), "Found prediction that does not match the libsvm prediction in line {0}, using {1}", instanceNum, kernelType); instanceNum++; @@ -1892,7 +1892,7 @@ private void CompareSvmToLibSvmCore(string kernelType, string kernel, IHostEnvir for (int i = 0; i < predictions1.Count - 1; i++) { - Assert.IsTrue(IsLessThanOrEqual(predArray1[i], predArray1[i + 1], MaxRelError, Epsilon), + Assert.True(IsLessThanOrEqual(predArray1[i], predArray1[i + 1], MaxRelError, Epsilon), "Different ordering of our results and libsvm results"); } } @@ -2329,7 +2329,7 @@ public void TestFeatureHandlerModelReuse() dataModelFile, ciFile); var args = new TLCArguments(); - Assert.IsTrue(CmdParser.ParseArguments(argsString, args)); + 
Assert.True(CmdParser.ParseArguments(argsString, args)); RunExperiments.Run(args); // REVIEW: think of a test that would distinguish more dramatically the case when /im works and when it doesn't diff --git a/test/Microsoft.ML.Predictor.Tests/TestTrivialPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestTrivialPredictors.cs index c95b6d220a..d66a2a9b12 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestTrivialPredictors.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestTrivialPredictors.cs @@ -16,14 +16,14 @@ public class TestTrivialPredictors { private static void CheckOutput(Float expected, Float actual) { - Assert.AreEqual(expected, actual, "difference between original/serialized models output"); + Assert.Equal(expected, actual, "difference between original/serialized models output"); } private static void CheckOutput(Float[] expected, Float[] actual) { - Assert.AreEqual(expected.Length, actual.Length, "difference between original/serialized models output length"); + Assert.Equal(expected.Length, actual.Length, "difference between original/serialized models output length"); for (int i = 0; i < expected.Length; ++i) - Assert.AreEqual(expected[i], actual[i], "difference between original/serialized models output index {0}", i); + Assert.Equal(expected[i], actual[i], "difference between original/serialized models output index {0}", i); } /// @@ -57,7 +57,7 @@ private static void TrivialHelper(Func), loadedPredictor, "did not load expected model"); + Assert.NotEqual(default(IPredictor), loadedPredictor, "did not load expected model"); } TOutput result = predictor.Predict(instances[0]); diff --git a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs deleted file mode 100644 index ef62dcae77..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs +++ /dev/null @@ -1,99 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. 
-// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.IO; -using Microsoft.ML.Data; -using Microsoft.ML.Transforms.Image; -using Microsoft.ML.TestFramework.Attributes; -using Microsoft.ML.Transforms; -using Xunit; - -namespace Microsoft.ML.Scenarios -{ - public partial class ScenariosTests - { - [TensorFlowFact] - public void TensorFlowTransforCifarEndToEndTest() - { - var imageHeight = 32; - var imageWidth = 32; - var model_location = "cifar_model/frozen_model.pb"; - var dataFile = GetDataPath("images/images.tsv"); - var imageFolder = Path.GetDirectoryName(dataFile); - - var mlContext = new MLContext(seed: 1); - var data = TextLoader.Create(mlContext, new TextLoader.Options() - { - Columns = new[] - { - new TextLoader.Column("ImagePath", DataKind.String, 0), - new TextLoader.Column("Label", DataKind.String, 1), - } - }, new MultiFileSource(dataFile)); - - var pipeEstimator = new ImageLoadingEstimator(mlContext, imageFolder, ("ImageReal", "ImagePath")) - .Append(new ImageResizingEstimator(mlContext, "ImageCropped", imageHeight, imageWidth, "ImageReal")) - .Append(new ImagePixelExtractingEstimator(mlContext, "Input", "ImageCropped", interleavePixelColors: true)) - .Append(mlContext.Model.LoadTensorFlowModel(model_location).ScoreTensorFlowModel("Output", "Input")) - .Append(new ColumnConcatenatingEstimator(mlContext, "Features", "Output")) - .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) - .AppendCacheCheckpoint(mlContext) - .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy()); - - - var transformer = pipeEstimator.Fit(data); - var predictions = transformer.Transform(data); - - var metrics = mlContext.MulticlassClassification.Evaluate(predictions); - Assert.Equal(1, metrics.MicroAccuracy, 2); - - var predictFunction = mlContext.Model.CreatePredictionEngine(transformer); - var prediction = predictFunction.Predict(new CifarData() - { - 
ImagePath = GetDataPath("images/banana.jpg") - }); - Assert.Equal(0, prediction.PredictedScores[0], 2); - Assert.Equal(1, prediction.PredictedScores[1], 2); - Assert.Equal(0, prediction.PredictedScores[2], 2); - - prediction = predictFunction.Predict(new CifarData() - { - ImagePath = GetDataPath("images/hotdog.jpg") - }); - Assert.Equal(0, prediction.PredictedScores[0], 2); - Assert.Equal(0, prediction.PredictedScores[1], 2); - Assert.Equal(1, prediction.PredictedScores[2], 2); - } - } - - public class CifarData - { - [LoadColumn(0)] - public string ImagePath; - - [LoadColumn(1)] - public string Label; - } - - public class CifarPrediction - { - [ColumnName("Score")] - public float[] PredictedScores; - } - - public class ImageNetData - { - [LoadColumn(0)] - public string ImagePath; - - [LoadColumn(1)] - public string Label; - } - - public class ImageNetPrediction - { - [ColumnName("Score")] - public float[] PredictedLabels; - } -} diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 810a308d67..a48ecc6550 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -5,21 +5,30 @@ using System; using System.Collections.Generic; using System.IO; +using System.IO.Compression; using System.Linq; +using System.Net; using System.Runtime.InteropServices; using Microsoft.ML.Data; using Microsoft.ML.RunTests; +using Microsoft.ML.TestFramework; using Microsoft.ML.TestFramework.Attributes; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Image; using Microsoft.ML.Transforms.TensorFlow; -using Tensorflow; using Xunit; +using Xunit.Abstractions; +using static Microsoft.ML.DataOperationsCatalog; namespace Microsoft.ML.Scenarios { - public partial class ScenariosTests + [Collection("NoParallelization")] + public sealed class 
TensorFlowScenariosTests : BaseTestClass { + public TensorFlowScenariosTests(ITestOutputHelper output) : base(output) + { + } + private class TestData { [VectorType(4)] @@ -28,6 +37,89 @@ private class TestData public float[] b; } + public class CifarData + { + [LoadColumn(0)] + public string ImagePath; + + [LoadColumn(1)] + public string Label; + } + + public class CifarPrediction + { + [ColumnName("Score")] + public float[] PredictedScores; + } + + public class ImageNetData + { + [LoadColumn(0)] + public string ImagePath; + + [LoadColumn(1)] + public string Label; + } + + public class ImageNetPrediction + { + [ColumnName("Score")] + public float[] PredictedLabels; + } + + [TensorFlowFact] + public void TensorFlowTransforCifarEndToEndTest2() + { + var imageHeight = 32; + var imageWidth = 32; + var model_location = "cifar_model/frozen_model.pb"; + var dataFile = GetDataPath("images/images.tsv"); + var imageFolder = Path.GetDirectoryName(dataFile); + + var mlContext = new MLContext(seed: 1); + var data = TextLoader.Create(mlContext, new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("ImagePath", DataKind.String, 0), + new TextLoader.Column("Label", DataKind.String, 1), + } + }, new MultiFileSource(dataFile)); + + var pipeEstimator = new ImageLoadingEstimator(mlContext, imageFolder, ("ImageReal", "ImagePath")) + .Append(new ImageResizingEstimator(mlContext, "ImageCropped", imageHeight, imageWidth, "ImageReal")) + .Append(new ImagePixelExtractingEstimator(mlContext, "Input", "ImageCropped", interleavePixelColors: true)) + .Append(mlContext.Model.LoadTensorFlowModel(model_location).ScoreTensorFlowModel("Output", "Input")) + .Append(new ColumnConcatenatingEstimator(mlContext, "Features", "Output")) + .Append(new ValueToKeyMappingEstimator(mlContext, "Label")) + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy()); + + + var transformer = pipeEstimator.Fit(data); + var predictions = 
transformer.Transform(data); + + var metrics = mlContext.MulticlassClassification.Evaluate(predictions); + Assert.Equal(1, metrics.MicroAccuracy, 2); + + var predictFunction = mlContext.Model.CreatePredictionEngine(transformer); + var prediction = predictFunction.Predict(new CifarData() + { + ImagePath = GetDataPath("images/banana.jpg") + }); + Assert.Equal(0, prediction.PredictedScores[0], 2); + Assert.Equal(1, prediction.PredictedScores[1], 2); + Assert.Equal(0, prediction.PredictedScores[2], 2); + + prediction = predictFunction.Predict(new CifarData() + { + ImagePath = GetDataPath("images/hotdog.jpg") + }); + Assert.Equal(0, prediction.PredictedScores[0], 2); + Assert.Equal(0, prediction.PredictedScores[1], 2); + Assert.Equal(1, prediction.PredictedScores[2], 2); + } + [TensorFlowFact] public void TensorFlowTransformMatrixMultiplicationTest() { @@ -1113,5 +1205,183 @@ public void TensorFlowStringTest() Assert.Equal(input.A[i], textOutput.AOut[i]); Assert.Equal(string.Join(" ", input.B).Replace("/", " "), textOutput.BOut[0]); } + + [TensorFlowFact] + public void TensorFlowImageClassification() + { + string assetsRelativePath = @"assets"; + string assetsPath = GetAbsolutePath(assetsRelativePath); + string imagesDownloadFolderPath = Path.Combine(assetsPath, "inputs", + "images"); + + //Download the image set and unzip + string finalImagesFolderName = DownloadImageSet( + imagesDownloadFolderPath); + + string fullImagesetFolderPath = Path.Combine( + imagesDownloadFolderPath, finalImagesFolderName); + + MLContext mlContext = new MLContext(seed: 1); + + //Load all the original images info + IEnumerable images = LoadImagesFromDirectory( + folder: fullImagesetFolderPath, useFolderNameAsLabel: true); + + IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows( + mlContext.Data.LoadFromEnumerable(images), seed: 1); + + shuffledFullImagesDataset = mlContext.Transforms.Conversion + .MapValueToKey("Label") + .Fit(shuffledFullImagesDataset) + 
.Transform(shuffledFullImagesDataset); + + // Split the data 80:20 into train and test sets, train and evaluate. + TrainTestData trainTestData = mlContext.Data.TrainTestSplit( + shuffledFullImagesDataset, testFraction: 0.2, seed: 1); + + IDataView trainDataset = trainTestData.TrainSet; + IDataView testDataset = trainTestData.TestSet; + + var pipeline = mlContext.Model.ImageClassification( + "ImagePath", "Label", + arch: ImageClassificationEstimator.Architecture.ResnetV2101, + epoch: 5, + batchSize: 5, + learningRate: 0.01f, + testOnTrainSet: false); + + var trainedModel = pipeline.Fit(trainDataset); + + mlContext.Model.Save(trainedModel, shuffledFullImagesDataset.Schema, + "model.zip"); + + ITransformer loadedModel; + DataViewSchema schema; + using (var file = File.OpenRead("model.zip")) + loadedModel = mlContext.Model.Load(file, out schema); + + IDataView predictions = trainedModel.Transform(testDataset); + var metrics = mlContext.MulticlassClassification.Evaluate(predictions); + + // On Ubuntu the results seem to vary quite a bit but they can probably be + // controlled by training more epochs, however that will slow the + // build down. Accuracy values seen were 0.33, 0.66, 0.70+. The model + // seems to be unstable, there could be many reasons, will need to + // investigate this further. 
+ if (!(RuntimeInformation.IsOSPlatform(OSPlatform.Windows) || + (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)))) + { + Assert.InRange(metrics.MicroAccuracy, 0.3, 1); + Assert.InRange(metrics.MacroAccuracy, 0.3, 1); + } + else + { + Assert.Equal(1, metrics.MicroAccuracy); + Assert.Equal(1, metrics.MacroAccuracy); + } + } + + public static IEnumerable LoadImagesFromDirectory(string folder, + bool useFolderNameAsLabel = true) + { + var files = Directory.GetFiles(folder, "*", + searchOption: SearchOption.AllDirectories); + + foreach (var file in files) + { + if (Path.GetExtension(file) != ".jpg") + continue; + + var label = Path.GetFileName(file); + if (useFolderNameAsLabel) + label = Directory.GetParent(file).Name; + else + { + for (int index = 0; index < label.Length; index++) + { + if (!char.IsLetter(label[index])) + { + label = label.Substring(0, index); + break; + } + } + } + + yield return new ImageData() + { + ImagePath = file, + Label = label + }; + + } + } + + public static string DownloadImageSet(string imagesDownloadFolder) + { + string fileName = "flower_photos_tiny_set_for_unit_tests.zip"; + string url = $"https://mlnetfilestorage.file.core.windows.net/imagesets" + + $"/flower_images/flower_photos_tiny_set_for_unit_tests.zip?st=2019" + + $"-08-29T00%3A07%3A21Z&se=2030-08-30T00%3A07%3A00Z&sp=rl&sv=2018" + + $"-03-28&sr=f&sig=N8HbLziTcT61kstprNLmn%2BDC0JoMrNwo6yRWb3hLLag%3D"; + + Download(url, imagesDownloadFolder, fileName); + UnZip(Path.Combine(imagesDownloadFolder, fileName), imagesDownloadFolder); + + return Path.GetFileNameWithoutExtension(fileName); + } + + private static bool Download(string url, string destDir, string destFileName) + { + if (destFileName == null) + destFileName = url.Split(Path.DirectorySeparatorChar).Last(); + + Directory.CreateDirectory(destDir); + + string relativeFilePath = Path.Combine(destDir, destFileName); + + if (File.Exists(relativeFilePath)) + return false; + + new WebClient().DownloadFile(url, relativeFilePath); + 
return true; + } + + private static void UnZip(String gzArchiveName, String destFolder) + { + var flag = gzArchiveName.Split(Path.DirectorySeparatorChar) + .Last() + .Split('.') + .First() + ".bin"; + + if (File.Exists(Path.Combine(destFolder, flag))) + return; + + ZipFile.ExtractToDirectory(gzArchiveName, destFolder); + File.Create(Path.Combine(destFolder, flag)); + } + + public static string GetAbsolutePath(string relativePath) => + Path.Combine(new FileInfo(typeof( + TensorFlowScenariosTests).Assembly.Location).Directory.FullName, relativePath); + + + public class ImageData + { + [LoadColumn(0)] + public string ImagePath; + + [LoadColumn(1)] + public string Label; + } + + public class ImagePrediction + { + [ColumnName("Score")] + public float[] Score; + + [ColumnName("PredictedLabel")] + public UInt32 PredictedLabel; + } + } } diff --git a/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs b/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs index b3838a791e..2a78910617 100644 --- a/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/TensorFlowEstimatorTests.cs @@ -17,6 +17,10 @@ namespace Microsoft.ML.Tests { + [CollectionDefinition("NoParallelization", DisableParallelization = true)] + public class NoParallelizationCollection { } + + [Collection("NoParallelization")] public class TensorFlowEstimatorTests : TestDataPipeBase { private class TestData From 56983d545cc441c73e3323b50aa46b8ad8f8ae1f Mon Sep 17 00:00:00 2001 From: Harish Kulkarni Date: Thu, 29 Aug 2019 21:58:02 -0700 Subject: [PATCH 4/6] Syncing upstream fork (#11) * Throw error on incorrect Label name in InferColumns API (#47) * Added sequential grouping of columns * reverted the file * addded infer columns label name checking * added column detection error * removed unsed usings * added quotes * replace Where with Any clause * replace Where with Any clause * Set Nullable Auto params to null values (#50) * Added sequential grouping of columns * reverted the file * 
added auto params as null * change to the update fields method * First public api propsal (#52) * Includes following 1) Final proposal for 0.1 public API surface 2) Prefeaturization 3) Splitting train data into train and validate when validation data is null 4) Providing end to end samples one each for regression, binaryclassification and multiclass classification * Incorporating code review feedbacks * Revert "Set Nullable Auto params to null values" (#53) * Revert "First public api propsal (#52)" This reverts commit e4a64cf4aeab13ee9e5bf0efe242da3270241bd7. * Revert "Set Nullable Auto params to null values (#50)" This reverts commit 41c663cd14247d44022f40cf2dce5977dbab282d. * AutoFit return type is now an IEnumerable (#55) AutoFit returns is now an IEnumerable - this enables many good things Implementing variety of early stopping criteria (See sample) Early discard of models that are no good. This improves memory usage efficiency. (See sample) No need to implement a callback to get results back Getting best score is now outside of API implementation. It is a simple math function to compare scores (See sample). Also templatized the return type for better type safety through out the code. 
* misc fixes & test additions, towards 0.1 release (#56) * Enable UnitTests on build server (#57) * 1) Making trainer name public (#62) 2) Fixing up samples to reflect it * Initial version of CLI tool for mlnet (#61) * added global tool initial project * removed unneccesary files, renamed files * refactoring and added base abstract classes for trainer generator * removed unused class * Added classes for transforms * added transform generate dummy classes * more refactoring, added first transform * more refactoring and added classes * changed the project structure * restructing added options class * sln changes * refactored options to different class: * added more logic for code generation of class * misc changes * reverted file * added commandline api package * reverted sample * added new command line api parser * added normalization of column names * Added command defaults and error message * implementation of all trainers * changed auto to null * added all transform generators * added error handling when args is empty and minor changes due to change in AutoML api names * changed the name of param * added new command line options and restructuring code * renamed proj file and added solution * Added code to generate usings, Fixed few bugs in the code * added validation to the command line options * changed project name * Bug fixes due to API change in AutoML * changed directory structure * added test framework and basic tests * added more tests * added improvements to template and error handling * renamed the estimator name * fixed test case * added comments * added headers * changed namespace and removed unneccesary properties from project * Revert "changed namespace and removed unneccesary properties from project" This reverts commit 9edae033e9845e910f663f296e168f1182b84f5f. 
* fixed test cases and renamed namespaces * cleaned up proj file * added folder structure * added symbols/tokens for strings * added more tests * review comments * modified test cases * review comments * change in the exception message * normalized line endings * made method private static * simplified range building /optimization * minor fix * added header * added static methods in command where necessary * nit picks * made few methods static * review comments * nitpick * remove line pragmas * fix test case * Use better AutiFit overload and ignore Multiclass (#64) * Upgrading CLI to produce ML.NET V.10 APIs and bunch of Refactoring tasks (#65) * Added sequential grouping of columns * reverted the file * upgrade to v .10 and refactoring * added null check * fixed unit tests * review comments * removed the settings change * added regions * fixed unit tests * Upgrade ML.NET package to 0.10.0 (#70) * Change in template to accomodate new API of TextLoader (#72) * Added sequential grouping of columns * reverted the file * changed to new API of Text Loader * changed signature * added params for taking additional settings * changes to codegen params * refactoring of templates and fixing errors * Enable gated check for mlnet.tests (#79) * Added sequential grouping of columns * reverted the file * changed to new API of Text Loader * changed signature * added params for taking additional settings * changes to codegen params * refactoring of templates and fixing errors * added run-tests.proj and referred it in build.proj * CLI tool - make validation dataset optional and support for crossvalidation in generated code (#83) * Added sequential grouping of columns * reverted the file * bug fixes, more logic to templates to support cross-validate * formatting and fix type in consolehelper * Added logic in templates * revert settings * benchmarking related changes (#63) * Create test.txt * Create test.txt * changes needed for benchmarking * forgot one file * merge conflict fix * fix 
build break * back out my version of the fix for Label column issue and fix the original fix * bogus file removal * undo SuggestedPipeline change * remove labelCol from pipeline suggester * fix build break * fix fast forest learner (don't sweep over learning rate) (#88) * Made changes to Have non-calibrated scoring for binary classifiers (#86) * Added sequential grouping of columns * reverted the file * added calibration workaround * removed print probability * reverted settings * rev ColumnInference API: can take label index; rev output object types; add tests (#89) * rename AutoML to Microsoft.ML.Auto everywhere and a shot at publishing nuget package (#99) * Create test.txt * Create test.txt * changes needed for benchmarking * forgot one file * merge conflict fix * fix build break * back out my version of the fix for Label column issue and fix the original fix * bogus file removal * undo SuggestedPipeline change * remove labelCol from pipeline suggester * fix build break * rename AutoML to Microsoft.ML.Auto everywhere and a shot at publishing nuget package (will probably need tweaks once I try to use the pipleline) * publish nuget (#101) * use dotnet-internal-temp agent for internal build * use dotnet-internal feed * Fix Codegen for columnConvert and ValueToKeyMapping transform and add individual transform tests (#95) * Added sequential grouping of columns * reverted the file * fix usings for type convert * added transforms tests * review comments * When generating usings choose only distinct usings directives (#94) * Added sequential grouping of columns * reverted the file * Added code to have unique strings * refactoring * minor fix * minor fix * Autofit overloads + cancellation + progress callbacks 1) Introduce AutoFit overloads (basic and advanced) 2) AutoFit Cancellation 3) AutoFit progress callbacks * Default the kfolds to value 5 in CLI generated code (#115) * Added sequential grouping of columns * reverted the file * Set up CI with Azure Pipelines * 
Update azure-pipelines.yml for Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * remove file * added kfold param and defaulted to value * changed type * added for regression * Remove extra ; from generated code (#114) * Added sequential grouping of columns * reverted the file * Set up CI with Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * removed extra ; from generated code * removed file * fix unit tests * TimeoutInSeconds (#116) Specifying timeout in seconds instead of minutes * Added more command line args implementation to CLI tool and refactoring (#110) * Added sequential grouping of columns * reverted the file * Set up CI with Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * Update azure-pipelines.yml for Azure Pipelines * added git status * reverted change * added codegen options and refactoring * minor fixes' * renamed params, minor refactoring * added tests for commandline and refactoring * removed file * added back the test case * minor fixes * Update src/mlnet.Test/CommandLineTests.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * review comments * capitalize the first character * changed the name of test case * remove unused directives * Fail gracefully if unable to instantiate data view with swept parameters (#125) * gracefully fail if fail to parse a datai * rev * validate AutoFit 'Features' column must be of type R4 (#132) * Samples: exceptions / nits (#124) * Logging support in CLI + Implementation of cmd args [--name,--output,--verbosity] (#121) * addded logging and helper methods * fixing code after merge * added resx files, added logger framework, added logging messages * added new options * added spacing * minor fixes * change command description * rename option, add headers, include new param in test * formatted * build fix * changed option name * Added NlogConfig file * added back config package * fix tests * added 
correct validation check (#137) * Use CreateTextLoader(..) instead of CreateTextLoader(..) (#138) * added support to loaddata by class in the generated code * fix tests * changed CreateTextLoader to ReadFromTextFile method. (#140) * changed textloader to readfromtextfile method * formatting * exception fixes (#136) * infer purpose of hidden columns as 'ignore' (#142) * Added approval tests and bunch of refactoring of code and normalizing namespaces (#148) * changed textloader to readfromtextfile method * formatting * added approval tests and refactoring of code * removed few comments * API 2.0 skeleton (#149) Incorporating API review feedback * The CV code should come before the training when there is no test dataset in generated code (#151) * reorder cv code * build fix * fixed structure * Format the generated code + bunch of misc tasks (#152) * added formatting and minor changes for reordering cv * fixing the template * minor changes * formatting changes * fixed approval test * removed unused nuget * added missing value replacing * added test for new transform * fix test * Update src/mlnet/Templates/Console/MLCodeGen.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * Sanitize the column names in CLI (#162) * added sanitization layer in CLI * fix test * changed exception.StackTrace to exception.ToString() * fix package name (#168) * Rev public API (#163) * Rename TransformGeneratorBase .cs to TransformGeneratorBase.cs (#153) * Fix minor version for the repository + remove Nlog config package (#171) * changed the minor version * removed the nlog config package * Added new test to columninfo and fixing up API (#178) * Make optimizing metric customizable and add trainer whitelist functionality (#172) * API rev (#181) * propagate root MLContext thru AutoML (instead of creating our own) (#182) * Enabling new command line args (#183) * fix package name * initial commit * added more commandline args * fixed tests * added headers * fix tests * fix 
test * rename 'AutoFitter' to 'Experiment' (#169) * added tests (#187) * rev InferColumns to accept ColumnInfo input param (#186) * Implement argument --has-header and change usage of dataset (#194) * added has header and fixed dataset and train dataset * fix tests * removed dummy command (#195) * Fix bug for regression and sanitize input label from user (#198) * removed dummy command * sanitize label and fix template * fix tests * Do not generate code concatenating columns when the dataset has a single feature column (#191) * Include some missed logging in the generated code. (#199) * added logging messages for generated code * added log messages * deleted file * cleaning up proj files (#185) * removed platform target * removed platform target * Some spaces and extra lines + bug in output path (#204) * nit picks * nit picks * fix test * accept label from user input and provide in generated code (#205) * Rev handling of weight / label columns (#203) * migrate to private ML.NET nuget for latest bug fixes (#131) * fix multiclass with nonstandard label (#207) * Multiclass nondefault label test (#208) * printing escaped chars + bug (#212) * delete unused internal samples (#211) * fix SMAC bug that causes multiclass sample to infinite loop (#209) * Rev user input validation for new API (#210) * added console message for exit and nit picks (#215) * exit when exception encountered (#216) * Seal API classes (and make EnableCaching internal) (#217) * Suggested sample nits (feel free to ask for any of these to be reverted) (#219) * User input column type validation (#218) * upgrade commandline and renaming (#221) * upgrade commandline and renaming * renaming fields * Make build.sh, init-tools.sh, & run.sh executable on OSX/Linux (#225) * CLI argument descriptions updated (#224) * CLI argument descriptions updated * No version in .csproj * added flag to disable training code (#227) * Exit if perfect model produced (#220) * removed header (#228) * removed header * added auto 
generated header * removed console read key (#229) * Fix model path in generated file (#230) * removed console read key * fix model path * fix test * reorder samples (#231) * remove rule that infers column purpose as categorical if # of distinct values is < 100 (#233) * Null reference exception fix for finding best model when some runs have failed (#239) * samples fixes (#238) * fix for defaulting Averaged Perceptron # of iterations to 10 (#237) * Bug bash feedback Feb 27. API changes and sample changes (#240) * Bug bash feedback Feb 27. API changes Sample changes Exception fix * Samples / API rev from 2/27 bug bash feedback (#242) * changed the directory structure for generated project (#243) * changed the directory structure for generated project * changed test * upgraded commandline package * Fix test file locations on OSX (#235) * fix test file locations on OSX * changing to Path.Combine() * Additional Path.Combine() * Remove ConsoleCodeGeneratorTests.GeneratedTrainCodeTest.received.txt * Additional Path.Combine() * add back in double comparison fix * remove metrics agent NaN returns * test fix * test format fix * mock out path Thanks to @daholste for additional fixes! 
* upgrade to latest ML.NET public surface (#246) * Upgrade to ML.NET 0.11 (#247) * initial changes * fix lightgbm * changed normalize method * added tests * fix tests * fix test * Private preview final API changes (#250) * .NET framework design guidelines applied to public surface * WhitelistedTrainers -> Trainers * Add estimator to public API iteration result (#248) * LightGBM pipeline serialization fix (#251) * Change order that we search for TextLoader's parameters (#256) * CLI IFileInfo null exception fix (#254) * Averaged Perceptron pipeline serialization fix (#257) * Upgrade command-line-api and default folder name change (#258) * change in defautl folderName * upgrade command line * Update src/mlnet/Program.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * eliminate IFileInfo from CLI (#260) * Rev samples towards private preview; ignored columns fix (#259) * remove unused methods in consolehelper and nit picks in generated code (#261) * nit picks * change in console helper * fix tests * add space * fix tests * added nuget sources in generated csproj (#262) * added nuget sources in csproj * changed the structure in generated code * space * upgrade to mlnet 0.11 (#263) * Formatting CLI metrics (#264) Ensures space between printed metrics (also model counter). Right aligned metrics. Extended AUC to four digits. 
* Add implementation of non -ova multi class trainers code gen (#267) * added non ova multi class learners * added tests * test cases * Add caching (#249) * AdvancedExperimentSettings sample nits (#265) * Add sampling key column (#268) * Initial work for multi-class classification support for CLI (#226) * Initial work for multi-class classification support for CLI * String updates * more strings * Whitelist non-OVA multi-class learners * Refactor the orchestration of AutoML calls (#272) * Do not auto-group columns with suggested purpose = 'Ignore' (#273) * Fix: during type inferencing, parse whitespace strings as NaN (#271) * Printing additional metrics in CLI for binary classification (#274) * Printing additional metrics in CLI for binary classification * Update src/mlnet/Utilities/ConsolePrinter.cs * Add API option to store models on disk (instead of in memory); fix IEstimator memory leak (#269) * Print failed iterations in CLI (#275) * change the type to float from double (#277) * cache arg implementation in CLI (#280) * cache implementation * corrected the null case * added tests for all cases * Remove duplicate value-to-key mapping transform for multiclass string labels (#283) * Add post-trainer transform SDK infra; add KeyToValueMapping transform to CLI; fix: for generated multiclass models, convert predicted label from key to original label column type (#286) * Implement ignore columns command line arg (#290) * normalize line endings * added --ignore-columns * null checks * unit tests * Print winning iteration and runtime in CLI (#288) * Print best metric and runtime * Print best metric and runtime * Line endings in AutoMLEngine.cs * Rename time column to duration to match Python SDK * Revert to MicroAccuracy and MacroAccuracy spellings * Revert spelling of BinaryClassificationMetricsAgent to BinaryMetricsAgent to reduce merge conflicts * Revert spelling of MulticlassMetricsAgent to MultiMetricsAgent to reduce merge conflicts * missed some files * Fix merge 
conflict * Update AutoMLEngine.cs * Add MacOS & Linux to CI; MacOS & Linux test fixes (#293) * MicroAccuracy as default for multi-class (#295) Change default optimization metric for multi-class classification to MicroAccuracy (accuracy). Previously it was set to MacroAccuracy. * Null exception for ignorecolumns in CLI (#294) * Null exception for ignorecolumns in CLI * Check if ignore-columns array has values (as the default is now a empty array) * Emit caching flag in pipeline object model. (Includes SuggestedPipelineBuilder refactor & debug string fixes / refactor) (#296) * removed sln (#297) * Caching enabling in code gen part -2 (#298) * add * added caching codegen * support comma separated values for --ignore-columns (#300) * default initialization for ignore columns (#302) * default initialization * adde null check * Codegen for multiclass non-ova (#303) * changes to template * multicalss codegen * test cases * fix test cases * Generated Project new structure. (#305) * added new templates * writing files to disck * change path * added new templates * misisng braces * fix bugs * format code * added util methods for solution file creation and addition of projects to it * added extra packages to project files * new tests * added correct path for sln * build fix * fix build * include using system in prediction class (#307) * added using * fix test * Random number generator is not thread safe (#310) * Random number generator is not thread safe * Another local random generator * Missed a few references * Referncing AutoMlUtils.random instead of a local RNG * More refs to mail RNG; remove Float as per https://github.com/dotnet/machinelearning/issues/1669 * Missed Random.cs * Fix multiclass code gen (#314) * compile error in codegen * removes scores printing * fix bugs * fix test * Fix compile error in codegen project (#319) * removed redundant code * fix test case * Rev OVA pipeline node SDK output: wrap binary trainers as children inside parent OVA node (#317) * Ova 
Multi class codegen support (#321) * dummy * multiova implementation * fix tests * remove inclusion list * fix tests and console helper * Rev run result trainer name for OVA: output different trainer name for each OVA + binary learner combination (#322) * Rev run result trainer name for Ova: output different trainer name for each Ova + binary learner combination * test fixes * Console helper bug in generated code for multiclass (#323) * fix * fix test * looping perlogclass * fix test * Initial version of Progress bar impl and CLI UI experience (#325) * progressbar * added progressbar and refactoring * reverted * revert sign assembly * added headers and removed exception rethrow * Setting model directory to temp directory (#327) * Suggested changes to progress bar (#335) * progressbar * added progressbar and refactoring * reverted * revert sign assembly * added headers and removed exception rethrow * bug fixes and updates to UI * added friendly name printing for metric * formatting * Rev Samples (#334) * Telemetry2 (#333) * Create test.txt * Create test.txt * changes needed for benchmarking * forgot one file * merge conflict fix * fix build break * back out my version of the fix for Label column issue and fix the original fix * bogus file removal * undo SuggestedPipeline change * remove labelCol from pipeline suggester * fix build break * rename AutoML to Microsoft.ML.Auto everywhere and a shot at publishing nuget package (will probably need tweaks once I try to use the pipleline) * tweak queue in vsts-ci.yml * CLI telemetry implementation * Telemetry implementation * delete unnecessary file and change file size bucket to actually log log2 instead of nearest ceil value * add headers, remove comments * one more header missing * Fix progress bar in linux/osx (#336) * progressbar * added progressbar and refactoring * reverted * revert sign assembly * added headers and removed exception rethrow * bug fixes and updates to UI * added friendly name printing for metric * 
formatting * change from task to thread * Update src/mlnet/CodeGenerator/CodeGenerationHelper.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * Mem leak fix (#328) * Create test.txt * Create test.txt * changes needed for benchmarking * forgot one file * merge conflict fix * fix build break * back out my version of the fix for Label column issue and fix the original fix * bogus file removal * undo SuggestedPipeline change * remove labelCol from pipeline suggester * fix build break * rename AutoML to Microsoft.ML.Auto everywhere and a shot at publishing nuget package (will probably need tweaks once I try to use the pipleline) * tweak queue in vsts-ci.yml * there is still investigation to be done but this fix works and solves memory leak problems * minor refactor * Upgrade ML.NET package (#343) * Add cross-validation (CV), and auto-CV for small datasets; push common API experiment methods into base class (#287) * restore old yml for internal pipeline so we can publish nuget again to devdiv stream (#344) * Polishing the CLI UI part-1 (#338) * formatting of pbar message * Polishing the UI * optimization * rename variable * Update src/mlnet/AutoML/AutoMLEngine.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * Update src/mlnet/CodeGenerator/CodeGenerationHelper.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * new message * changed hhtp to https * added iteration num + 1 * change string name and add color to artifacts * change the message * build errors * added null checks * added exception messsages to log file * added exception messsages to log file * CLI ML.NET version upgrade (#345) * Sample revs; ColumnInformation property name revs; pre-featurizer fixes (#346) * CLI -- consume logs from AutoML SDK (#349) * Rename RunDetails --> RunDetail (#350) * command line api upgrade and progress bar rendering bug (#366) * added fix for all platforms progress bar * upgrade nuget * removed args from 
writeline * change in the version (#368) * fix few bugs in progressbar and verbosity (#374) * fix few bugs in progressbar and verbosity * removed unused name space * Fix for folders with space in it while generating project (#376) * support for folders with spaces * added support for paths with space * revert file * change name of var * remove spaces * SMAC fix for minimizing metrics (#363) * Formatting Regression metrics and progress bar display days. (#379) * added progress bar day display and fix regression metrics * fix formatting * added total time * formatted total time * change command name and add pbar message (#380) * change command name and add pbar message * fix tests * added aliases * duplicate alias * added another alias for task * UI missing features (#382) * added formatting changes * added accuracy specifically * downgrade the codepages (#384) * Change in project structure (#385) * initial changes * Change in project structure * correcting test * change variable name * fix tests * fix tests * fix more tests * fix codegen errors * adde log file message * changed name of args * change variable names * fix test * FileSizeBuckets in correct units (#387) * Minor telemetry change to log in correct units and make our life easier in the future * Use Ceiling instead of Round * changed order (#388) * prep work to transfer to ml.net (#389) * move test projects to top level test subdir * rename some projects to make naming consistent and make it build again * fix test project refs * Add AutoML components to build, fix issues related to that so it builds * fix test cases, remove AppInsights ref from AutoML (#3329) * [AutoML] disable netfx build leg for now (#3331) * disable netfx build leg for now * disable netfx build leg for now. 
* [AutoML] Add AutoML XML documentation to all public members; migrate AutoML projects & tests into ML.NET solution; AutoML test fixes (#3351) * [AutoML] Rev AutoML public API; add required native references to AutoML projects (#3364) * [AutoML] Minor changes to generated project in CLI based on feedback (#3371) * nitpicks for generated project * revert back the target framework * [AutoML] Migrate AutoML back to its own solution, w/ NuGet dependencies (#3373) * Migrate AutoML back to its own solution, w/ NuGet dependencies * build project updates; parameter name revert * dummy change * Revert "dummy change" This reverts commit 3e8574266f556a4d5b6805eb55b4d8b8b84cf355. * [AutoML] publish AutoML package (#3383) * publish AutoML package * Only leave automl and mlnet tests to run * publish AutoML package * Only leave automl and mlnet tests to run * fix build issues when ml.net is not building * bump version to 0.3 since that's the one we're going to ship for build (#3416) * [AutoML] temporarily disable all but x64 platforms -- don't want to do native builds and can't find a way around that with the current VSTS pipeline (#3420) * disable steps but keep phases to keep vsts build pipeline happy (#3423) * API docs for experimentation (#3484) * fixed path bug and regression metrics correction (#3504) * changed the casing of option alias as it conflicts with --help (#3554) * [AutoML] Generated project - FastTree nuget package inclusion dynamically (#3567) * added support for fast tree nuget pack inclusion in generated project * fix testcase * changed the tool name in telemetry message * dummy commit * remove space * dummy commit to trigger build * [AutoML] Add AutoML example code (#3458) * AutoML PipelineSuggester: don't recommend pipelines from first-stage trainers that failed (#3593) * InferColumns API: Validate all columns specified in column info exist in inferred data view (#3599) * [AutoML] AutoML SDK API: validate schema types of input IDataView (#3597) * [AutoML] If 
first three iterations all fail, short-circuit AutoML experiment (#3591) * mlnet CLI nupkg creation/signing (#3606) * mlnet CLI nupkg creation/signing * relmove includeinpackage from mlnet csproj * address PR comments -- some minor reshuffling of stuff * publish symbols for mlnet CLI * fix case in NLog.config * [AutoML] rename Auto to AutoML in namespace and nuget (#3609) * mlnet CLI nupkg creation/signing * [AutoML] take dependency on a specific ml.net version (#3610) * take dependency on a specific ml.net version * catch up to spelling fix for OptimizationTolerance * force a specific ml.net nuget version, fix typo (#3616) * [AutoML] Fix error handling in CLI. (#3618) * fix error handling * renaming variables * [AutoML] turn off line pragmas in .tt files to play nice with signing (#3617) * turn off line pragmas in .tt files to play nice with signing * dedupe tags * change the param name (#3619) * [AutoML] return null instead of null ref crash on Model property accessor (#3620) * return null instead of null ref crash on Model property accessor * [AutoML] Handling label column names which have space and exception logging (#3624) * fix case of label with space and exception logging * final handler * revert file * use Name instead of FullName for telemetry filename hash (#3633) * renamed classes (#3634) * change ML.NET dependency to 1.0 (#3639) [AutoML] undo pinning ML.NET dependency * set exploration time default in CLI to half hour (#3640) * [AutoML] step 2 of removing pinned nupkg versions (#3642) * InferColumns API that consumes label column index -- Only rename label column to 'Label' for headerless files (#3643) * [AutoML] Upgrade ml.net package in generated code (#3644) * upgrade the mlnet package in gen code * Update src/mlnet/Templates/Console/ModelProject.cs Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * Update src/mlnet/Templates/Console/ModelProject.tt Co-Authored-By: srsaggam <41802116+srsaggam@users.noreply.github.com> * added 
spaces * [AutoML] Early stopping in CLI based on the exploration time (#3641) * early stopping in CLI * remove unused variables * change back to thread * remove sleep * fix review comments * remove ununsed usings * format message * collapse declaration * remove unused param * added environment.exit and removal of error message * correction in message * secs-> seconds * exit code * change value to 1 * reverse the declaration * [AutoML] Change wording for CouldNotFinshOnTime message (#3655) * set exploration time default in CLI to half hour * [AutoML] Change wording for CouldNotFinshOnTime message * [AutoML] Change wording for CouldNotFinshOnTime message * even better wording for CouldNotFinshOnTime * temp change to get around vsts publish failure (#3656) * [AutoML] bump version to 0.4.0 (#3658) * implement culture invariant strings (#3725) * reset culture (#3730) * [AutoML] Cross validation fixes; validate empty training / validation input data (#3794) * [AutoML] Enable style cop rules & resolve errors (#3823) * add task agnostic wrappers for autofit calls (#3860) * [AutoML] CLI telemetry rev (#3789) * delete automl .sln * CLI -- regenerate templated CS files (#3954) * [AutoML] Bump ML.NET package version to 1.2.0 in AutoML API and CLI; and AutoML package versions to 0.14.0 (#3958) * Build AutoML NuGet package (#3961) * Increment AutoML build version to 0.15.0 for preview. 
(#3968) * added culture independent parsing (#3731) * - convert tests to xunit - take project level dependency on ML.NET components instead of nuget - set up bestfriends relationship to ML.Core and remove some of the copies of util classes from AutoML.NET (more work needed to fully remove them, work item 4064) - misc build script changes to address PR comments * address issues only showing up in a couple configurations during CI build * fix cut&paste error * [AutoML] Bump version to ML.NET 1.3.1 in AutoML API and CLI and AutoML package version to 0.15.1 (#4071) * bumped version * change versions in nupkg * revert version bump in branch props * [AutoML] Fix for Exception thrown in cross val when one of the score equals infinity. (#4073) * bumped version * change versions in nupkg * revert version bump in branch props * added infinity fix * changes signing (#4079) * Addressed PR comments and build issues - sync block on creating test data file (failed intermittently) - removed classes we copied over from ML.Core and fixed their uses to de-dupe and use original ML.Core versions since we now have InternalsVisible and BestFriends - Fixed nupkg creation to use projects insted of public nuget version for AutoML - Fixed a bunch of unit tests that didn't actually test what they were supposed to test, while removing cut&past code and dependencies. - Few more misc small changes * minor nit - removed unused folder ref * Fix the .sln file for the right configurations. * Fix mistake in .sln file * test fixes and disable one test * fix tests, re-add AutoML samples csproj * bumped VS version to 16 in .sln, removed InternalsVisible for a dead assembly, removed unused references from AutoML test project * Updated docs to include PredictedLabel member (#4107) * Fixed build errors resulting from upgrade to VS2019 compilers * Added additional message describing the previous fix * Updated docs to include PredictedLabel member * Added CODEOWNERS file in the .github/ folder. 
(#4140) * Added CODEOWNERS file in the .github/ folder. This allows reviewers to review any changes in the machine learning repository * Updated .github/CODEOWNERS with the team instead of individual reviewers * Added AutoML team reviewers (#4144) * Added CODEOWNERS file in the .github/ folder. This allows reviewers to review any changes in the machine learning repository * Updated .github/CODEOWNERS with the team instead of individual reviewers * Added AutoML team reviwers to files owned by AutoML team * Added AutoML team reviwers to files owned by AutoML team * Removed two files that don't exist for AutoML team in CODEOWNERS * Build extension method to reload changes without specifying model name (#4146) * Image classification preview 2. (#4151) * Image classification preview 2. * PR feedback. * Add unit-test. * Add unit-test. * Add unit-test. * Add unit-test. * Use Path.Combine instead of Join. * fix test dataset path. * fix test dataset path. * Improve test. * Improve test. * Increase epochs in tests. * Disable test on Ubuntu. * Move test to its own project. * Move test to its own project. * Move test to its own project. * Move test to its own file. * cleanup. * Disable parallel execution of tensorflow tests. * PR feedback. * PR feedback. * PR feedback. * PR feedback. * Prevent TF test to execute in parallel. * PR feedback. * Build error. * clean up. From 5bb6b8a771c9ac2046ababb2e54ca8ae7c896cce Mon Sep 17 00:00:00 2001 From: "Harish S. 
Kulkarni" Date: Sun, 8 Sep 2019 13:51:23 -0700 Subject: [PATCH 5/6] Added onnx export functionality for MissingValueIndicatorTransformer --- src/Microsoft.ML.OnnxConverter/OnnxUtils.cs | 2 +- .../MissingValueIndicatorTransformer.cs | 43 +++++++++++++- .../ExcludeVariablesInOnnxConversion.txt | 2 +- .../BreastCancer/ModelWithLessIO.txt | 2 +- .../BreastCancer/OneHotBagPipeline.txt | 6 +- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 57 +++++++++++++++++++ 6 files changed, 105 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.OnnxConverter/OnnxUtils.cs b/src/Microsoft.ML.OnnxConverter/OnnxUtils.cs index c2c4b1eaea..f36b7cbc2e 100644 --- a/src/Microsoft.ML.OnnxConverter/OnnxUtils.cs +++ b/src/Microsoft.ML.OnnxConverter/OnnxUtils.cs @@ -227,7 +227,7 @@ private static TensorProto.Types.DataType ConvertToTensorProtoType(Type rawType) var dataType = TensorProto.Types.DataType.Undefined; if (rawType == typeof(bool)) - dataType = TensorProto.Types.DataType.Float; + dataType = TensorProto.Types.DataType.Bool; else if (rawType == typeof(ReadOnlyMemory)) dataType = TensorProto.Types.DataType.String; else if (rawType == typeof(sbyte)) diff --git a/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs b/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs index a8058ead78..5df5cb5a6b 100644 --- a/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs +++ b/src/Microsoft.ML.Transforms/MissingValueIndicatorTransformer.cs @@ -10,6 +10,7 @@ using Microsoft.ML.CommandLine; using Microsoft.ML.Data; using Microsoft.ML.Internal.Utilities; +using Microsoft.ML.Model.OnnxConverter; using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; @@ -140,7 +141,7 @@ private protected override void SaveModel(ModelSaveContext ctx) private protected override IRowMapper MakeRowMapper(DataViewSchema schema) => new Mapper(this, schema); - private sealed class Mapper : OneToOneMapperBase + private sealed class Mapper : OneToOneMapperBase, ISaveAsOnnx { private 
readonly MissingValueIndicatorTransformer _parent; private readonly ColInfo[] _infos; @@ -426,6 +427,46 @@ private void FillValues(int srcLength, ref VBuffer dst, List indices, dst = editor.Commit(); } } + + public bool CanSaveOnnx(OnnxContext ctx) => true; + + public void SaveAsOnnx(OnnxContext ctx) + { + Host.CheckValue(ctx, nameof(ctx)); + + for (int iinfo = 0; iinfo < _infos.Length; ++iinfo) + { + ColInfo info = _infos[iinfo]; + string inputColumnName = info.InputColumnName; + if (!ctx.ContainsColumn(inputColumnName)) + { + ctx.RemoveColumn(info.Name, false); + continue; + } + + if (!SaveAsOnnxCore(ctx, iinfo, info, ctx.GetVariableName(inputColumnName), + ctx.AddIntermediateVariable(_infos[iinfo].OutputType, info.Name))) + { + ctx.RemoveColumn(info.Name, true); + } + } + } + + private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string srcVariableName, string dstVariableName) + { + var inputType = _infos[iinfo].InputType; + Type rawType = (inputType is VectorDataViewType vectorType) ? 
vectorType.ItemType.RawType : inputType.RawType; + + if (rawType != typeof(float)) + return false; + + string opType; + opType = "IsNaN"; + var isNaNOutput = ctx.AddIntermediateVariable(BooleanDataViewType.Instance, "IsNaNOutput", true); + var nanNode = ctx.CreateNode(opType, srcVariableName, dstVariableName, ctx.GetNodeName(opType), ""); + + return true; + } } } diff --git a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt index f0795a1f13..0e43749793 100644 --- a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt +++ b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ExcludeVariablesInOnnxConversion.txt @@ -459,7 +459,7 @@ "name": "PredictedLabel0", "type": { "tensorType": { - "elemType": "FLOAT", + "elemType": "BOOL", "shape": { "dim": [ { diff --git a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt index 22aee806af..e0decf5739 100644 --- a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt +++ b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/ModelWithLessIO.txt @@ -786,7 +786,7 @@ "name": "PredictedLabel0", "type": { "tensorType": { - "elemType": "FLOAT", + "elemType": "BOOL", "shape": { "dim": [ { diff --git a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/OneHotBagPipeline.txt b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/OneHotBagPipeline.txt index 68335b20ad..5d88daca32 100644 --- a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/OneHotBagPipeline.txt +++ b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/OneHotBagPipeline.txt @@ -414,7 +414,7 @@ "name": "Label", "type": { 
"tensorType": { - "elemType": "FLOAT", + "elemType": "BOOL", "shape": { "dim": [ { @@ -470,7 +470,7 @@ "name": "Label0", "type": { "tensorType": { - "elemType": "FLOAT", + "elemType": "BOOL", "shape": { "dim": [ { @@ -542,7 +542,7 @@ "name": "PredictedLabel0", "type": { "tensorType": { - "elemType": "FLOAT", + "elemType": "BOOL", "shape": { "dim": [ { diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 138d49f5d4..fad37b6078 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -740,6 +740,63 @@ public void OnnxTypeConversionTest() } } + private class TransformedDataPoint : DataPoint, IEquatable + { + [VectorType(3)] + public int[] MissingIndicator { get; set; } + + public bool Equals(TransformedDataPoint other) + { + return Enumerable.SequenceEqual(MissingIndicator, other.MissingIndicator); + } + } + + [Fact] + void IndicateMissingValuesOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + + var samples = new List() + { + new DataPoint() { Features = new float[3] {1, 1, 0}, }, + new DataPoint() { Features = new float[3] {0, float.NaN, 1}, }, + new DataPoint() { Features = new float[3] {-1, float.NaN, float.PositiveInfinity}, }, + }; + var dataView = mlContext.Data.LoadFromEnumerable(samples); + + // IsNaN outputs a binary tensor. Support for this has been added in the latest version + // of Onnxruntime, but that hasn't been released yet. + // So we need to convert its type to Int32 until then. 
+ // ConvertType part of the pipeline can be removed once we pick up a new release of the Onnx runtime + + var pipeline = mlContext.Transforms.IndicateMissingValues(new[] { new InputOutputColumnPair("MissingIndicator", "Features"), }) + .Append(mlContext.Transforms.Conversion.ConvertType("MissingIndicator", outputKind: DataKind.Int32)); + + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var mlnetData = mlContext.Data.CreateEnumerable(transformedData, false); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + + var onnxFileName = "IndicateMissingValues.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + + SaveOnnxModel(onnxModel, onnxModelPath, null); + + // Compare results produced by ML.NET and ONNX's runtime. + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedVectorColumns(model.LastTransformer.ColumnPairs[0].outputColumnName, outputNames[1], transformedData, onnxResult); + } + + Done(); + } + private void CreateDummyExamplesToMakeComplierHappy() { var dummyExample = new BreastCancerFeatureVector() { Features = null }; From cdec913408ed4668a872c07b384d9f5038592e75 Mon Sep 17 00:00:00 2001 From: "Harish S. 
Kulkarni" Date: Mon, 9 Sep 2019 10:06:39 -0700 Subject: [PATCH 6/6] Added baseline file for comparison on other platforms and fixed definition of supported platforms --- src/Microsoft.ML.OnnxConverter/OnnxUtils.cs | 2 +- .../Onnx/Transforms/IndicateMissingValues.txt | 163 ++++++++++++++++++ test/Microsoft.ML.Tests/OnnxConversionTest.cs | 13 +- 3 files changed, 175 insertions(+), 3 deletions(-) create mode 100644 test/BaselineOutput/Common/Onnx/Transforms/IndicateMissingValues.txt diff --git a/src/Microsoft.ML.OnnxConverter/OnnxUtils.cs b/src/Microsoft.ML.OnnxConverter/OnnxUtils.cs index f36b7cbc2e..77b27f3a92 100644 --- a/src/Microsoft.ML.OnnxConverter/OnnxUtils.cs +++ b/src/Microsoft.ML.OnnxConverter/OnnxUtils.cs @@ -305,7 +305,7 @@ public static ModelProto MakeModel(List nodes, string producerName, s model.IrVersion = (long)OnnxCSharpToProtoWrapper.Version.IrVersion; model.ModelVersion = modelVersion; model.OpsetImport.Add(new OperatorSetIdProto() { Domain = "ai.onnx.ml", Version = 1 }); - model.OpsetImport.Add(new OperatorSetIdProto() { Domain = "", Version = 7 }); + model.OpsetImport.Add(new OperatorSetIdProto() { Domain = "", Version = 9 }); model.Graph = new GraphProto(); var graph = model.Graph; graph.Node.Add(nodes); diff --git a/test/BaselineOutput/Common/Onnx/Transforms/IndicateMissingValues.txt b/test/BaselineOutput/Common/Onnx/Transforms/IndicateMissingValues.txt new file mode 100644 index 0000000000..964d073ed9 --- /dev/null +++ b/test/BaselineOutput/Common/Onnx/Transforms/IndicateMissingValues.txt @@ -0,0 +1,163 @@ +{ + "irVersion": "3", + "producerName": "ML.NET", + "producerVersion": "##VERSION##", + "domain": "machinelearning.dotnet", + "graph": { + "node": [ + { + "input": [ + "Features" + ], + "output": [ + "MissingIndicator" + ], + "name": "IsNaN", + "opType": "IsNaN" + }, + { + "input": [ + "MissingIndicator" + ], + "output": [ + "MissingIndicator0" + ], + "name": "Cast", + "opType": "Cast", + "attribute": [ + { + "name": "to", + "i": "6", + 
"type": "INT" + } + ] + }, + { + "input": [ + "Features" + ], + "output": [ + "Features0" + ], + "name": "Identity", + "opType": "Identity" + }, + { + "input": [ + "MissingIndicator0" + ], + "output": [ + "MissingIndicator1" + ], + "name": "Identity0", + "opType": "Identity" + } + ], + "name": "model", + "input": [ + { + "name": "Features", + "type": { + "tensorType": { + "elemType": "FLOAT", + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "3" + } + ] + } + } + } + } + ], + "output": [ + { + "name": "Features0", + "type": { + "tensorType": { + "elemType": "FLOAT", + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "3" + } + ] + } + } + } + }, + { + "name": "MissingIndicator1", + "type": { + "tensorType": { + "elemType": "INT32", + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "3" + } + ] + } + } + } + } + ], + "valueInfo": [ + { + "name": "MissingIndicator", + "type": { + "tensorType": { + "elemType": "BOOL", + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "3" + } + ] + } + } + } + }, + { + "name": "MissingIndicator0", + "type": { + "tensorType": { + "elemType": "INT32", + "shape": { + "dim": [ + { + "dimValue": "1" + }, + { + "dimValue": "3" + } + ] + } + } + } + } + ] + }, + "opsetImport": [ + { + "domain": "ai.onnx.ml", + "version": "1" + }, + { + "version": "9" + } + ] +} \ No newline at end of file diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index fad37b6078..9ea36bdadd 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -41,6 +41,11 @@ public OnnxConversionTest(ITestOutputHelper output) : base(output) { } + private bool IsOnnxRuntimeSupported() + { + return Environment.Is64BitProcess && (!RuntimeInformation.IsOSPlatform(OSPlatform.Linux) || AttributeHelpers.CheckLibcVersionGreaterThanMinimum(new System.Version(2, 23))); + } + /// /// In this test, we 
convert a trained into ONNX file and then /// call to evaluate that file. The outputs of are checked against the original @@ -777,13 +782,16 @@ void IndicateMissingValuesOnnxConversionTest() var mlnetData = mlContext.Data.CreateEnumerable(transformedData, false); var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Transforms"); var onnxFileName = "IndicateMissingValues.onnx"; + var onnxTextName = "IndicateMissingValues.txt"; var onnxModelPath = GetOutputPath(onnxFileName); + var onnxTextPath = GetOutputPath(subDir, onnxTextName); - SaveOnnxModel(onnxModel, onnxModelPath, null); + SaveOnnxModel(onnxModel, onnxModelPath, onnxTextPath); // Compare results produced by ML.NET and ONNX's runtime. - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + if (IsOnnxRuntimeSupported()) { // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); @@ -794,6 +802,7 @@ void IndicateMissingValuesOnnxConversionTest() CompareSelectedVectorColumns(model.LastTransformer.ColumnPairs[0].outputColumnName, outputNames[1], transformedData, onnxResult); } + CheckEquality(subDir, onnxTextName, parseOption: NumberParseOption.UseSingle); Done(); }