Source code for pyspark.mllib.tree

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import

import random

from pyspark import SparkContext, RDD
from pyspark.mllib.common import callMLlibFunc, inherit_doc, JavaModelWrapper
from pyspark.mllib.linalg import _convert_to_vector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import JavaLoader, JavaSaveable

__all__ = ['DecisionTreeModel', 'DecisionTree', 'RandomForestModel',
           'RandomForest', 'GradientBoostedTreesModel', 'GradientBoostedTrees']


class TreeEnsembleModel(JavaModelWrapper, JavaSaveable):
    def predict(self, x):
        """
        Predict values for a single data point or an RDD of points using
        the model trained.

        Note: In Python, predict cannot currently be used within an RDD
              transformation or action.
              Call predict directly on the RDD instead.
        """
        if isinstance(x, RDD):
            return self.call("predict", x.map(_convert_to_vector))

        else:
            return self.call("predict", _convert_to_vector(x))

    def numTrees(self):
        """
        Get number of trees in ensemble.
        """
        return self.call("numTrees")

    def totalNumNodes(self):
        """
        Get total number of nodes, summed over all trees in the
        ensemble.
        """
        return self.call("totalNumNodes")

    def __repr__(self):
        """ Summary of model """
        return self._java_model.toString()

    def toDebugString(self):
        """ Full model """
        return self._java_model.toDebugString()


[docs]class DecisionTreeModel(JavaModelWrapper, JavaSaveable, JavaLoader): """ .. note:: Experimental A decision tree model for classification or regression. """
[docs] def predict(self, x): """ Predict the label of one or more examples. Note: In Python, predict cannot currently be used within an RDD transformation or action. Call predict directly on the RDD instead. :param x: Data point (feature vector), or an RDD of data points (feature vectors). """ if isinstance(x, RDD): return self.call("predict", x.map(_convert_to_vector)) else: return self.call("predict", _convert_to_vector(x))
[docs] def numNodes(self): return self._java_model.numNodes()
[docs] def depth(self): return self._java_model.depth()
def __repr__(self): """ summary of model. """ return self._java_model.toString()
[docs] def toDebugString(self): """ full model. """ return self._java_model.toDebugString()
@classmethod def _java_loader_class(cls): return "org.apache.spark.mllib.tree.model.DecisionTreeModel"
[docs]class DecisionTree(object): """ .. note:: Experimental Learning algorithm for a decision tree model for classification or regression. """ @classmethod def _train(cls, data, type, numClasses, features, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): first = data.first() assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" model = callMLlibFunc("trainDecisionTreeModel", data, type, numClasses, features, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain) return DecisionTreeModel(model) @classmethod
[docs] def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ Train a DecisionTreeModel for classification. :param data: Training data: RDD of LabeledPoint. Labels are integers {0,1,...,numClasses}. :param numClasses: Number of classes for classification. :param categoricalFeaturesInfo: Map from categorical feature index to number of categories. Any feature not in this map is treated as continuous. :param impurity: Supported values: "entropy" or "gini" :param maxDepth: Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. :param minInstancesPerNode: Min number of instances required at child nodes to create the parent split :param minInfoGain: Min info gain required to create a split :return: DecisionTreeModel Example usage: >>> from numpy import array >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import DecisionTree >>> >>> data = [ ... LabeledPoint(0.0, [0.0]), ... LabeledPoint(1.0, [1.0]), ... LabeledPoint(1.0, [2.0]), ... LabeledPoint(1.0, [3.0]) ... ] >>> model = DecisionTree.trainClassifier(sc.parallelize(data), 2, {}) >>> print(model) DecisionTreeModel classifier of depth 1 with 3 nodes >>> print(model.toDebugString()) DecisionTreeModel classifier of depth 1 with 3 nodes If (feature 0 <= 0.0) Predict: 0.0 Else (feature 0 > 0.0) Predict: 1.0 <BLANKLINE> >>> model.predict(array([1.0])) 1.0 >>> model.predict(array([0.0])) 0.0 >>> rdd = sc.parallelize([[1.0], [0.0]]) >>> model.predict(rdd).collect() [1.0, 0.0] """ return cls._train(data, "classification", numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
@classmethod
[docs] def trainRegressor(cls, data, categoricalFeaturesInfo, impurity="variance", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0): """ Train a DecisionTreeModel for regression. :param data: Training data: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: Map from categorical feature index to number of categories. Any feature not in this map is treated as continuous. :param impurity: Supported values: "variance" :param maxDepth: Max depth of tree. E.g., depth 0 means 1 leaf node. Depth 1 means 1 internal node + 2 leaf nodes. :param maxBins: Number of bins used for finding splits at each node. :param minInstancesPerNode: Min number of instances required at child nodes to create the parent split :param minInfoGain: Min info gain required to create a split :return: DecisionTreeModel Example usage: >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import DecisionTree >>> from pyspark.mllib.linalg import SparseVector >>> >>> sparse_data = [ ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})), ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) ... ] >>> >>> model = DecisionTree.trainRegressor(sc.parallelize(sparse_data), {}) >>> model.predict(SparseVector(2, {1: 1.0})) 1.0 >>> model.predict(SparseVector(2, {1: 0.0})) 0.0 >>> rdd = sc.parallelize([[0.0, 1.0], [0.0, 0.0]]) >>> model.predict(rdd).collect() [1.0, 0.0] """ return cls._train(data, "regression", 0, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
@inherit_doc
[docs]class RandomForestModel(TreeEnsembleModel, JavaLoader): """ .. note:: Experimental Represents a random forest model. """ @classmethod def _java_loader_class(cls): return "org.apache.spark.mllib.tree.model.RandomForestModel"
[docs]class RandomForest(object): """ .. note:: Experimental Learning algorithm for a random forest model for classification or regression. """ supportedFeatureSubsetStrategies = ("auto", "all", "sqrt", "log2", "onethird") @classmethod def _train(cls, data, algo, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed): first = data.first() assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" if featureSubsetStrategy not in cls.supportedFeatureSubsetStrategies: raise ValueError("unsupported featureSubsetStrategy: %s" % featureSubsetStrategy) if seed is None: seed = random.randint(0, 1 << 30) model = callMLlibFunc("trainRandomForestModel", data, algo, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed) return RandomForestModel(model) @classmethod
[docs] def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", impurity="gini", maxDepth=4, maxBins=32, seed=None): """ Method to train a decision tree model for binary or multiclass classification. :param data: Training dataset: RDD of LabeledPoint. Labels should take values {0, 1, ..., numClasses-1}. :param numClasses: number of classes for classification. :param categoricalFeaturesInfo: Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: Number of features to consider for splits at each node. Supported: "auto" (default), "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "sqrt". :param impurity: Criterion used for information gain calculation. Supported values: "gini" (recommended) or "entropy". :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 4) :param maxBins: maximum number of bins used for splitting features (default: 32) :param seed: Random seed for bootstrapping and choosing feature subsets. :return: RandomForestModel that can be used for prediction Example usage: >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import RandomForest >>> >>> data = [ ... LabeledPoint(0.0, [0.0]), ... LabeledPoint(0.0, [1.0]), ... LabeledPoint(1.0, [2.0]), ... LabeledPoint(1.0, [3.0]) ... ] >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42) >>> model.numTrees() 3 >>> model.totalNumNodes() 7 >>> print(model) TreeEnsembleModel classifier with 3 trees <BLANKLINE> >>> print(model.toDebugString()) TreeEnsembleModel classifier with 3 trees <BLANKLINE> Tree 0: Predict: 1.0 Tree 1: If (feature 0 <= 1.0) Predict: 0.0 Else (feature 0 > 1.0) Predict: 1.0 Tree 2: If (feature 0 <= 1.0) Predict: 0.0 Else (feature 0 > 1.0) Predict: 1.0 <BLANKLINE> >>> model.predict([2.0]) 1.0 >>> model.predict([0.0]) 0.0 >>> rdd = sc.parallelize([[3.0], [1.0]]) >>> model.predict(rdd).collect() [1.0, 0.0] """ return cls._train(data, "classification", numClasses, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed)
@classmethod
[docs] def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy="auto", impurity="variance", maxDepth=4, maxBins=32, seed=None): """ Method to train a decision tree model for regression. :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param numTrees: Number of trees in the random forest. :param featureSubsetStrategy: Number of features to consider for splits at each node. Supported: "auto" (default), "all", "sqrt", "log2", "onethird". If "auto" is set, this parameter is set based on numTrees: if numTrees == 1, set to "all"; if numTrees > 1 (forest) set to "onethird" for regression. :param impurity: Criterion used for information gain calculation. Supported values: "variance". :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 4) :param maxBins: maximum number of bins used for splitting features (default: 32) :param seed: Random seed for bootstrapping and choosing feature subsets. :return: RandomForestModel that can be used for prediction Example usage: >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import RandomForest >>> from pyspark.mllib.linalg import SparseVector >>> >>> sparse_data = [ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) ... ] >>> >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42) >>> model.numTrees() 2 >>> model.totalNumNodes() 4 >>> model.predict(SparseVector(2, {1: 1.0})) 1.0 >>> model.predict(SparseVector(2, {0: 1.0})) 0.5 >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]]) >>> model.predict(rdd).collect() [1.0, 0.5] """ return cls._train(data, "regression", 0, categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, seed)
@inherit_doc
[docs]class GradientBoostedTreesModel(TreeEnsembleModel, JavaLoader): """ .. note:: Experimental Represents a gradient-boosted tree model. """ @classmethod def _java_loader_class(cls): return "org.apache.spark.mllib.tree.model.GradientBoostedTreesModel"
[docs]class GradientBoostedTrees(object): """ .. note:: Experimental Learning algorithm for a gradient boosted trees model for classification or regression. """ @classmethod def _train(cls, data, algo, categoricalFeaturesInfo, loss, numIterations, learningRate, maxDepth, maxBins): first = data.first() assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint" model = callMLlibFunc("trainGradientBoostedTreesModel", data, algo, categoricalFeaturesInfo, loss, numIterations, learningRate, maxDepth, maxBins) return GradientBoostedTreesModel(model) @classmethod
[docs] def trainClassifier(cls, data, categoricalFeaturesInfo, loss="logLoss", numIterations=100, learningRate=0.1, maxDepth=3, maxBins=32): """ Method to train a gradient-boosted trees model for classification. :param data: Training dataset: RDD of LabeledPoint. Labels should take values {0, 1}. :param categoricalFeaturesInfo: Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. Supported: {"logLoss" (default), "leastSquaresError", "leastAbsoluteError"}. :param numIterations: Number of iterations of boosting. (default: 100) :param learningRate: Learning rate for shrinking the contribution of each estimator. The learning rate should be between in the interval (0, 1]. (default: 0.1) :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 3) :param maxBins: maximum number of bins used for splitting features (default: 32) DecisionTree requires maxBins >= max categories :return: GradientBoostedTreesModel that can be used for prediction Example usage: >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import GradientBoostedTrees >>> >>> data = [ ... LabeledPoint(0.0, [0.0]), ... LabeledPoint(0.0, [1.0]), ... LabeledPoint(1.0, [2.0]), ... LabeledPoint(1.0, [3.0]) ... ] >>> >>> model = GradientBoostedTrees.trainClassifier(sc.parallelize(data), {}, numIterations=10) >>> model.numTrees() 10 >>> model.totalNumNodes() 30 >>> print(model) # it already has newline TreeEnsembleModel classifier with 10 trees <BLANKLINE> >>> model.predict([2.0]) 1.0 >>> model.predict([0.0]) 0.0 >>> rdd = sc.parallelize([[2.0], [0.0]]) >>> model.predict(rdd).collect() [1.0, 0.0] """ return cls._train(data, "classification", categoricalFeaturesInfo, loss, numIterations, learningRate, maxDepth, maxBins)
@classmethod
[docs] def trainRegressor(cls, data, categoricalFeaturesInfo, loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3, maxBins=32): """ Method to train a gradient-boosted trees model for regression. :param data: Training dataset: RDD of LabeledPoint. Labels are real numbers. :param categoricalFeaturesInfo: Map storing arity of categorical features. E.g., an entry (n -> k) indicates that feature n is categorical with k categories indexed from 0: {0, 1, ..., k-1}. :param loss: Loss function used for minimization during gradient boosting. Supported: {"logLoss" (default), "leastSquaresError", "leastAbsoluteError"}. :param numIterations: Number of iterations of boosting. (default: 100) :param learningRate: Learning rate for shrinking the contribution of each estimator. The learning rate should be between in the interval (0, 1]. (default: 0.1) :param maxBins: maximum number of bins used for splitting features (default: 32) DecisionTree requires maxBins >= max categories :param maxDepth: Maximum depth of the tree. E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 3) :return: GradientBoostedTreesModel that can be used for prediction Example usage: >>> from pyspark.mllib.regression import LabeledPoint >>> from pyspark.mllib.tree import GradientBoostedTrees >>> from pyspark.mllib.linalg import SparseVector >>> >>> sparse_data = [ ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})), ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})), ... LabeledPoint(1.0, SparseVector(2, {1: 2.0})) ... ] >>> >>> data = sc.parallelize(sparse_data) >>> model = GradientBoostedTrees.trainRegressor(data, {}, numIterations=10) >>> model.numTrees() 10 >>> model.totalNumNodes() 12 >>> model.predict(SparseVector(2, {1: 1.0})) 1.0 >>> model.predict(SparseVector(2, {0: 1.0})) 0.0 >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]]) >>> model.predict(rdd).collect() [1.0, 0.0] """ return cls._train(data, "regression", categoricalFeaturesInfo, loss, numIterations, learningRate, maxDepth, maxBins)
def _test(): import doctest globs = globals().copy() globs['sc'] = SparkContext('local[4]', 'PythonTest', batchSize=2) (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1) if __name__ == "__main__": _test()