components/assist_ranker/generic_logistic_regression_inference_unittest.cc

// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/assist_ranker/generic_logistic_regression_inference.h"
#include "components/assist_ranker/example_preprocessing.h"

#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/protobuf/src/google/protobuf/map.h"

namespace assist_ranker {
using ::google::protobuf::Map;

// Fixture for the GenericLogisticRegressionInference tests. It stores the
// names and weights of a small reference model as constants and exposes
// GetProto() to turn them into a GenericLogisticRegressionModel.
class GenericLogisticRegressionInferenceTest : public testing::Test {
 protected:
  // Returns a model proto populated from the fixture constants: bias,
  // threshold, three scalar weights, and one one-hot, one sparse and one
  // bucketized weight entry.
  GenericLogisticRegressionModel GetProto() {
    GenericLogisticRegressionModel model;
    model.set_bias(bias_);
    model.set_threshold(threshold_);

    auto* weight_map = model.mutable_weights();

    // Scalar features: a single weight per feature.
    (*weight_map)[scalar1_name_].set_scalar(scalar1_weight_);
    (*weight_map)[scalar2_name_].set_scalar(scalar2_weight_);
    (*weight_map)[scalar3_name_].set_scalar(scalar3_weight_);

    // One-hot feature: a default weight plus one weight per known element.
    auto* one_hot = (*weight_map)[one_hot_name_].mutable_one_hot();
    one_hot->set_default_weight(one_hot_default_weight_);
    auto& one_hot_elem_weights = *one_hot->mutable_weights();
    one_hot_elem_weights[one_hot_elem1_name_] = one_hot_elem1_weight_;
    one_hot_elem_weights[one_hot_elem2_name_] = one_hot_elem2_weight_;
    one_hot_elem_weights[one_hot_elem3_name_] = one_hot_elem3_weight_;

    // Sparse feature: a default weight plus one weight per known element.
    SparseWeights* sparse = (*weight_map)[sparse_name_].mutable_sparse();
    sparse->set_default_weight(sparse_default_weight_);
    auto& sparse_elem_weights = *sparse->mutable_weights();
    sparse_elem_weights[sparse_elem1_name_] = sparse_elem1_weight_;
    sparse_elem_weights[sparse_elem2_name_] = sparse_elem2_weight_;

    // Bucketized feature: a default weight, the bucket boundaries and the
    // per-bucket weights, appended in array order.
    BucketizedWeights* bucketized =
        (*weight_map)[bucketized_name_].mutable_bucketized();
    bucketized->set_default_weight(bucketization_default_weight_);
    for (const float edge : bucketization_boundaries_) {
      bucketized->add_boundaries(edge);
    }
    for (const float bucket_weight : bucketization_weights_) {
      bucketized->add_weights(bucket_weight);
    }

    return model;
  }

  // Names of the scalar features, the one-hot feature and its elements.
  const std::string scalar1_name_ = "scalar_feature1";
  const std::string scalar2_name_ = "scalar_feature2";
  const std::string scalar3_name_ = "scalar_feature3";
  const std::string one_hot_name_ = "one_hot_feature";
  const std::string one_hot_elem1_name_ = "one_hot_elem1";
  const std::string one_hot_elem2_name_ = "one_hot_elem2";
  const std::string one_hot_elem3_name_ = "one_hot_elem3";
  // Model bias and decision threshold written into the proto.
  const float bias_ = 1.5f;
  const float threshold_ = 0.6f;
  // Weights of the scalar features.
  const float scalar1_weight_ = 0.8f;
  const float scalar2_weight_ = -2.4f;
  const float scalar3_weight_ = 0.01f;
  // Weights of the one-hot elements. Note that the weight of element 3 is the
  // exact opposite of |bias_|.
  const float one_hot_elem1_weight_ = -1.0f;
  const float one_hot_elem2_weight_ = 5.0f;
  const float one_hot_elem3_weight_ = -1.5f;
  const float one_hot_default_weight_ = 10.0f;
  // Absolute tolerance used by the tests when comparing scores.
  const float epsilon_ = 0.001f;

  // Sparse feature: name, element names and weights.
  const std::string sparse_name_ = "sparse_feature";
  const std::string sparse_elem1_name_ = "sparse_elem1";
  const std::string sparse_elem2_name_ = "sparse_elem2";
  const float sparse_elem1_weight_ = -2.2f;
  const float sparse_elem2_weight_ = 3.1f;
  const float sparse_default_weight_ = 4.4f;

  // Bucketized feature: two boundaries, hence three per-bucket weights.
  const std::string bucketized_name_ = "bucketized_feature";
  const float bucketization_boundaries_[2] = {0.3f, 0.7f};
  const float bucketization_weights_[3] = {-1.0f, 1.0f, 3.0f};
  const float bucketization_default_weight_ = -3.3f;
};

// Scores an example that sets all three scalar features and a known one-hot
// element, then checks both the raw score and the thresholded decision.
TEST_F(GenericLogisticRegressionInferenceTest, BaseTest) {
  GenericLogisticRegressionInference predictor(GetProto());

  RankerExample example;
  auto* feature_map = example.mutable_features();
  (*feature_map)[scalar1_name_].set_bool_value(true);
  (*feature_map)[scalar2_name_].set_int32_value(42);
  (*feature_map)[scalar3_name_].set_float_value(0.666f);
  (*feature_map)[one_hot_name_].set_string_value(one_hot_elem1_name_);

  const float actual = predictor.PredictScore(example);

  // The expected pre-sigmoid sum counts the bool as 1, the int as 42 and the
  // float as-is, each multiplied by its weight, plus the one-hot element
  // weight and the bias.
  const float logit = bias_ + 1.0f * scalar1_weight_ +
                      42.0f * scalar2_weight_ + 0.666f * scalar3_weight_ +
                      one_hot_elem1_weight_;
  const float expected = Sigmoid(logit);

  EXPECT_NEAR(expected, actual, epsilon_);
  EXPECT_EQ(expected >= threshold_, predictor.Predict(example));
}

// A one-hot value that is not one of the model's known elements is expected
// to be scored with the one-hot default weight.
TEST_F(GenericLogisticRegressionInferenceTest, UnknownElement) {
  RankerExample example;
  auto* feature_map = example.mutable_features();
  (*feature_map)[one_hot_name_].set_string_value("Unknown element");

  GenericLogisticRegressionInference predictor(GetProto());
  const float actual = predictor.PredictScore(example);

  const float logit = bias_ + one_hot_default_weight_;
  const float expected = Sigmoid(logit);
  EXPECT_NEAR(expected, actual, epsilon_);
}

// Scores an example that carries no features at all.
TEST_F(GenericLogisticRegressionInferenceTest, MissingFeatures) {
  RankerExample empty_example;

  GenericLogisticRegressionInference predictor(GetProto());
  const float actual = predictor.PredictScore(empty_example);

  // Absent scalar features contribute nothing, while the absent one-hot
  // feature falls back to its default weight.
  const float logit = bias_ + one_hot_default_weight_;
  const float expected = Sigmoid(logit);
  EXPECT_NEAR(expected, actual, epsilon_);
}

// Features whose names the model does not know must not affect the score.
TEST_F(GenericLogisticRegressionInferenceTest, UnknownFeatures) {
  RankerExample example;
  auto* feature_map = example.mutable_features();
  // Four features, one of each value kind, under names absent from the model.
  (*feature_map)["foo1"].set_bool_value(true);
  (*feature_map)["foo2"].set_int32_value(42);
  (*feature_map)["foo3"].set_float_value(0.666f);
  (*feature_map)["foo4"].set_string_value(one_hot_elem1_name_);
  // The only feature the model knows about.
  (*feature_map)[one_hot_name_].set_string_value(one_hot_elem2_name_);

  GenericLogisticRegressionInference predictor(GetProto());
  const float actual = predictor.PredictScore(example);

  // Only the known one-hot element is expected to contribute.
  const float logit = bias_ + one_hot_elem2_weight_;
  const float expected = Sigmoid(logit);
  EXPECT_NEAR(expected, actual, epsilon_);
}

// Verifies the decision threshold. The score of a base example is computed
// first and installed as the model threshold; a tiny extra contribution is
// then added to the example to push the score just above or just below that
// threshold, and the resulting decision is checked.
TEST_F(GenericLogisticRegressionInferenceTest, Threshold) {
  GenericLogisticRegressionModel model = GetProto();
  GenericLogisticRegressionInference threshold_calculator(model);

  RankerExample example;
  auto* feature_map = example.mutable_features();
  (*feature_map)[scalar1_name_].set_bool_value(true);
  (*feature_map)[scalar2_name_].set_int32_value(2);
  (*feature_map)[one_hot_name_].set_string_value(one_hot_elem1_name_);

  // The base example's score becomes the decision threshold of the model.
  const float pivot = threshold_calculator.PredictScore(example);
  model.set_threshold(pivot);

  // Predictor built from the model that carries the computed threshold.
  GenericLogisticRegressionInference predictor(model);

  // A small positive scalar3 value nudges the score just above the threshold,
  // so the decision is expected to be positive.
  (*feature_map)[scalar3_name_].set_float_value(0.01f);
  float score = predictor.PredictScore(example);
  EXPECT_LT(pivot, score);
  EXPECT_NEAR(pivot, score, epsilon_);
  EXPECT_TRUE(predictor.Predict(example));

  // A small negative scalar3 value nudges the score just below the threshold,
  // so the decision is expected to flip.
  (*feature_map)[scalar3_name_].set_float_value(-0.01f);
  score = predictor.PredictScore(example);
  EXPECT_GT(pivot, score);
  EXPECT_NEAR(pivot, score, epsilon_);
  EXPECT_FALSE(predictor.Predict(example));
}

// Verifies the decision when the model carries no threshold; the test expects
// decisions to flip around a score of 0.5 in that case.
TEST_F(GenericLogisticRegressionInferenceTest, NoThreshold) {
  GenericLogisticRegressionModel model = GetProto();
  model.clear_threshold();
  GenericLogisticRegressionInference predictor(model);

  RankerExample example;
  auto* feature_map = example.mutable_features();

  // The weight of one_hot_elem3 cancels the bias exactly. With it as the only
  // active feature the pre-sigmoid sum is zero, so the score should be 0.5.
  (*feature_map)[one_hot_name_].set_string_value(one_hot_elem3_name_);
  float score = predictor.PredictScore(example);
  EXPECT_NEAR(0.5f, score, epsilon_);

  // A small positive scalar3 value pushes the score just above 0.5: the
  // decision should be positive.
  (*feature_map)[scalar3_name_].set_float_value(0.01f);
  score = predictor.PredictScore(example);
  EXPECT_LT(0.5f, score);
  EXPECT_NEAR(0.5f, score, epsilon_);
  EXPECT_TRUE(predictor.Predict(example));

  // A small negative scalar3 value pushes the score just below 0.5: the
  // decision should be negative.
  (*feature_map)[scalar3_name_].set_float_value(-0.01f);
  score = predictor.PredictScore(example);
  EXPECT_GT(0.5f, score);
  EXPECT_NEAR(0.5f, score, epsilon_);
  EXPECT_FALSE(predictor.Predict(example));
}

// Runs inference on a model flagged as preprocessed: the per-feature weights
// are cleared and replaced by a flat map keyed by feature full names, together
// with a preprocessor config.
TEST_F(GenericLogisticRegressionInferenceTest, PreprossessedModel) {
  GenericLogisticRegressionModel model = GetProto();
  model.set_is_preprocessed_model(true);
  // Drop the structured weights so that only fullname_weights can be used.
  model.clear_weights();

  // Key under which the default weight of a missing feature is stored.
  const auto missing_key = [](const std::string& feature_name) {
    return ExamplePreprocessor::FeatureFullname(
        ExamplePreprocessor::kMissingFeatureDefaultName, feature_name);
  };

  // Flat weights: scalars are keyed by their plain feature name.
  Map<std::string, float>* flat_weights = model.mutable_fullname_weights();
  (*flat_weights)[scalar1_name_] = scalar1_weight_;
  (*flat_weights)[scalar2_name_] = scalar2_weight_;
  (*flat_weights)[scalar3_name_] = scalar3_weight_;

  // One-hot and sparse elements are keyed by (feature, element) full names.
  const std::string one_hot_key1 =
      ExamplePreprocessor::FeatureFullname(one_hot_name_, one_hot_elem1_name_);
  const std::string one_hot_key2 =
      ExamplePreprocessor::FeatureFullname(one_hot_name_, one_hot_elem2_name_);
  const std::string one_hot_key3 =
      ExamplePreprocessor::FeatureFullname(one_hot_name_, one_hot_elem3_name_);
  (*flat_weights)[one_hot_key1] = one_hot_elem1_weight_;
  (*flat_weights)[one_hot_key2] = one_hot_elem2_weight_;
  (*flat_weights)[one_hot_key3] = one_hot_elem3_weight_;

  const std::string sparse_key1 =
      ExamplePreprocessor::FeatureFullname(sparse_name_, sparse_elem1_name_);
  const std::string sparse_key2 =
      ExamplePreprocessor::FeatureFullname(sparse_name_, sparse_elem2_name_);
  (*flat_weights)[sparse_key1] = sparse_elem1_weight_;
  (*flat_weights)[sparse_key2] = sparse_elem2_weight_;

  // Bucket weights are keyed by the feature name and the bucket index.
  const std::string bucket_key0 =
      ExamplePreprocessor::FeatureFullname(bucketized_name_, "0");
  const std::string bucket_key1 =
      ExamplePreprocessor::FeatureFullname(bucketized_name_, "1");
  const std::string bucket_key2 =
      ExamplePreprocessor::FeatureFullname(bucketized_name_, "2");
  (*flat_weights)[bucket_key0] = bucketization_weights_[0];
  (*flat_weights)[bucket_key1] = bucketization_weights_[1];
  (*flat_weights)[bucket_key2] = bucketization_weights_[2];

  // Default weights for the features that may be missing.
  (*flat_weights)[missing_key(one_hot_name_)] = one_hot_default_weight_;
  (*flat_weights)[missing_key(sparse_name_)] = sparse_default_weight_;
  (*flat_weights)[missing_key(bucketized_name_)] =
      bucketization_default_weight_;

  // Preprocessor config: the features tracked when missing, and the
  // boundaries of the bucketized feature in array order.
  ExamplePreprocessorConfig* preprocessor = model.mutable_preprocessor_config();
  preprocessor->add_missing_features(one_hot_name_);
  preprocessor->add_missing_features(sparse_name_);
  preprocessor->add_missing_features(bucketized_name_);
  auto& bucketizer = (*preprocessor->mutable_bucketizers())[bucketized_name_];
  for (const float edge : bucketization_boundaries_) {
    bucketizer.add_boundaries(edge);
  }

  GenericLogisticRegressionInference predictor(model);

  // Example with every feature present.
  RankerExample example;
  Map<std::string, Feature>* feature_map = example.mutable_features();
  (*feature_map)[scalar1_name_].set_bool_value(true);
  (*feature_map)[scalar2_name_].set_int32_value(42);
  (*feature_map)[scalar3_name_].set_float_value(0.666f);
  (*feature_map)[one_hot_name_].set_string_value(one_hot_elem1_name_);
  auto* sparse_values = (*feature_map)[sparse_name_].mutable_string_list();
  sparse_values->add_string_value(sparse_elem1_name_);
  sparse_values->add_string_value(sparse_elem2_name_);
  // 0.98 lies above the last boundary (0.7); the expected score below uses
  // the weight of the last bucket accordingly.
  (*feature_map)[bucketized_name_].set_float_value(0.98f);

  // Inference.
  const float actual = predictor.PredictScore(example);
  const float logit =
      bias_ + 1.0f * scalar1_weight_ + 42.0f * scalar2_weight_ +
      0.666f * scalar3_weight_ + one_hot_elem1_weight_ + sparse_elem1_weight_ +
      sparse_elem2_weight_ + bucketization_weights_[2];
  const float expected = Sigmoid(logit);

  EXPECT_NEAR(expected, actual, epsilon_);
  EXPECT_EQ(expected >= threshold_, predictor.Predict(example));
}

}  // namespace assist_ranker