DataOperationsCatalog.Cache(IDataView, String[]) Method

Definition

Creates a lazy in-memory cache of input.

public Microsoft.ML.IDataView Cache (Microsoft.ML.IDataView input, params string[] columnsToPrefetch);
member this.Cache : Microsoft.ML.IDataView * string[] -> Microsoft.ML.IDataView
Public Function Cache (input As IDataView, ParamArray columnsToPrefetch As String()) As IDataView

Parameters

input
IDataView

The input data.

columnsToPrefetch
String[]

The columns that must be cached whenever anything is cached. An empty array or null value means that columns are cached upon their first access.

Returns

IDataView

A lazily populated, in-memory cached view of input.

Examples

using System;
using Microsoft.ML;
using Microsoft.ML.SamplesUtils;

namespace Samples.Dynamic
{
    /// <summary>
    /// Sample that compares how long it takes to scan an IDataView before and
    /// after wrapping it with <c>mlContext.Data.Cache</c>.
    /// </summary>
    public static class Cache
    {
        /// <summary>
        /// Loads the housing regression dataset, scans it once uncached, then
        /// scans a cached view twice, printing the row count, the computed
        /// average and the elapsed time of each scan.
        /// </summary>
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for
            // exception tracking and logging, as a catalog of available
            // operations and as the source of randomness.
            var mlContext = new MLContext();

            var data = DatasetUtils.LoadHousingRegressionDataset(mlContext);

            // Time how long it takes to page through the records if we don't cache.
            (int lines, double columnAverage, double elapsedSeconds) =
                TimeToScanIDataView(mlContext, data);

            Console.WriteLine(
                $"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took " +
                $"{elapsedSeconds} seconds.");
            // Expected output (time is approximate):
            // Lines=506, averageOfColumn0=564.17 and took 0.314 seconds.

            // Now create a cached view of the data.
            var cachedData = mlContext.Data.Cache(data);

            // Time how long it takes to page through the records the first time
            // they're accessed after a cache is applied. This iteration will be
            // longer than subsequent calls, as the dataset is being accessed and
            // stored for later. Note that this operation may be relatively quick,
            // as the system may have cached the file.
            (lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext,
                cachedData);

            Console.WriteLine(
                $"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took " +
                $"{elapsedSeconds} seconds.");
            // Expected output (time is approximate):
            // Lines=506, averageOfColumn0=564.17 and took 0.056 seconds.

            // Time how long it takes to page through the records now that the data
            // is cached. After the first iteration that caches the IDataView,
            // future iterations, like this one, are faster because they are pulling
            // from data cached in memory.
            (lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext,
                cachedData);

            Console.WriteLine(
                $"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took " +
                $"{elapsedSeconds} seconds.");
            // Expected output (time is approximate):
            // Lines=506, averageOfColumn0=564.17 and took 0.006 seconds.
        }

        /// <summary>
        /// Enumerates every row of <paramref name="data"/> as a
        /// <see cref="HousingRegression"/> and measures how long the enumeration
        /// takes.
        /// </summary>
        /// <param name="mlContext">Context used to create the row enumerable.</param>
        /// <param name="data">The data view to scan.</param>
        /// <returns>
        /// The number of rows read, the mean over all rows of the sum of the
        /// twelve row fields, and the elapsed wall-clock time of the scan in
        /// (fractional) seconds. The mean is NaN when no rows were read.
        /// </returns>
        private static (int lines, double columnAverage, double elapsedSeconds)
            TimeToScanIDataView(MLContext mlContext, IDataView data)
        {
            int lines = 0;
            double columnAverage = 0.0;
            var enumerable = mlContext.Data
                .CreateEnumerable<HousingRegression>(data, reuseRowObject: true);

            // Only the enumeration itself is timed; building the enumerable
            // above is excluded from the measurement.
            var watch = System.Diagnostics.Stopwatch.StartNew();
            foreach (var row in enumerable)
            {
                lines++;
                // Touch every field so that each column is actually read.
                columnAverage += row.MedianHomeValue + row.CrimesPerCapita +
                    row.PercentResidental + row.PercentNonRetail + row.CharlesRiver
                    + row.NitricOxides + row.RoomsPerDwelling + row.PercentPre40s +
                    row.EmploymentDistance + row.HighwayDistance + row.TaxRate +
                    row.TeacherRatio;
            }
            watch.Stop();
            columnAverage /= lines;
            var elapsed = watch.Elapsed;

            // TotalSeconds is the whole duration as a fractional number of
            // seconds. TimeSpan.Seconds is only the integer seconds component
            // (0-59) and would report 0 for any sub-second scan.
            return (lines, columnAverage, elapsed.TotalSeconds);
        }

        /// <summary>
        /// A class to hold the raw housing regression rows.
        /// </summary>
        public sealed class HousingRegression
        {
            public float MedianHomeValue { get; set; }
            public float CrimesPerCapita { get; set; }
            public float PercentResidental { get; set; }
            public float PercentNonRetail { get; set; }
            public float CharlesRiver { get; set; }
            public float NitricOxides { get; set; }
            public float RoomsPerDwelling { get; set; }
            public float PercentPre40s { get; set; }
            public float EmploymentDistance { get; set; }
            public float HighwayDistance { get; set; }
            public float TaxRate { get; set; }
            public float TeacherRatio { get; set; }
        }

    }
}

Remarks

Caching happens per-column. A column is only cached when it is first accessed. In addition, columnsToPrefetch are considered 'always needed', so these columns will be cached the first time any data is requested.

Applies to