| Category Mapping

 GeneXproTools supports all kinds of categorical variables, either as part of entirely 
                            categorical datasets or intermixed with numerical variables. In both cases the 
                            categories in all categorical variables are automatically replaced by numerical values 
                            so that you can start modeling straightaway. 
 GeneXproTools uses simple heuristics to make this initial mapping, but then lets you choose
							more meaningful mappings in the Category Mapping Window.
 
 Dependent categorical variables are also supported but they are handled differently in 
							classification and logistic regression problems with more than two classes. 
							In these cases the 
							mapping is made so that only one class is singled out, resulting in a binomial outcome, 
							such as {0, 1} or {-1, 1}, which can then be used to create classification or 
							logistic regression models. The merging of the response variable in classification and 
							logistic regression is handled in the Class Merging & Discretization Window.
 
 In regression problems, dependent categorical variables are handled exactly as any other 
							categorical variable, that is, the categories in the response variable are also converted 
							to numerical values using user-defined mappings.
 
 The beauty and power of GeneXproTools' support for categorical variables go beyond 
							giving you access to a sophisticated and extremely useful tool for changing and 
							experimenting with different mappings easily and quickly by trying out different scenarios 
							and seeing immediately how they impact modeling. Indeed, GeneXproTools also generates code 
							that supports data in exactly the same format that was loaded into GeneXproTools. This means 
							that all the generated code, whether for external model deployment or for scoring internally in 
							GeneXproTools, also supports categorical variables. Below is an example in C++ of a 
							logistic regression model with 12 variables, 7 of which are categorical.
 
 							
//------------------------------------------------------------------
// Logistic regression model generated by GeneXproTools 5.0 on 5/17/2013 4:13:11 PM
// GEP File: D:\GeneXproTools\Version5.0\OnlineGuide\LoanRisk_03a.gep
// Training Records:  667
// Validation Records:   333
// Fitness Function:  Bounded ROC, Logistic Threshold
// Training Fitness:  726.124343110239
// Training Accuracy: 76.76% (512)
// Validation Fitness:   777.537892479522
// Validation Accuracy:  79.58% (265)
//------------------------------------------------------------------
#include "math.h"
#include "stdlib.h"
#include "string.h"
/* Scores one record given as an array of text fields; returns the logistic probability computed by the model. */
double gepModel(char* d_string[]);
/* Real cube root that also accepts negative arguments. */
double gep3Rt(double x);
/* Converts the text fields in `input` to numbers, writing them to the same indices of `output`. */
void TransformCategoricalInputs(char* input[], double output[]);
/*
 * Evaluates the logistic regression model for one record.
 *
 * d_string: the record's fields as text; they are converted to numbers by
 *           TransformCategoricalInputs before the model is evaluated.
 * Returns:  1 / (1 + exp(-(SLOPE * model + INTERCEPT))), where `model` is the
 *           sum of the four sub-expressions below.
 */
double gepModel(char* d_string[])
{
    const double G3C5 = -4.97848445081942;
    const double SLOPE = 6.9596006314631E-03;
    const double INTERCEPT = 3.45748287482188E-02;

    /* Only the slots filled in by TransformCategoricalInputs are read below. */
    double d[20];
    TransformCategoricalInputs(d_string, d);

    /* The four sub-expressions, accumulated in the original order. */
    const double term1 = (pow(d[0],4)+d[15]);
    const double term2 = exp(d[2]);
    const double term3 = (((d[1]*pow(gep3Rt((d[5]+G3C5)),3))-d[18])-d[8]);
    const double term4 = ((d[9]*(d[10]+d[12]))-(((d[7]*d[11])*d[7])*gep3Rt(d[10])));

    double modelOutput = 0.0;
    modelOutput = term1;
    modelOutput += term2;
    modelOutput += term3;
    modelOutput += term4;

    /* Logistic link applied to the linearly rescaled model output. */
    return 1.0 / (1.0 + exp(-(SLOPE * modelOutput + INTERCEPT)));
}
double gep3Rt(double x)
{
    return x < 0.0 ? -pow(-x,(1.0/3.0)) : pow(x,(1.0/3.0));
}
/* Number of elements in a true array (not a pointer). */
#define GEP_COUNT_OF(a) (sizeof(a) / sizeof((a)[0]))

/*
 * Returns the 1-based position of `value` within `categories` as a double,
 * or 0.0 when `value` is NULL or equals none of the `count` labels.
 */
static double gepCategoryCode(const char *value, const char *const categories[], size_t count)
{
    size_t i;

    if (value == NULL)
    {
        return 0.0;
    }
    for (i = 0; i < count; i++)
    {
        if (strcmp(categories[i], value) == 0)
        {
            return (double)(i + 1);
        }
    }
    return 0.0;
}

/*
 * Parses a numeric field. NULL, or text that does not start with a number,
 * yields 0.0.
 */
static double gepNumericValue(const char *value)
{
    return value != NULL ? strtod(value, NULL) : 0.0;
}

/*
 * Converts the text fields of one record into the numeric values used by
 * the model.
 *
 * input:  record fields as strings; indices 0, 1, 2, 5, 7, 8, 9, 10, 11, 12,
 *         15 and 18 are read.
 * output: receives the converted value at the same index as each field read;
 *         all other elements are left untouched.
 *
 * Categorical fields map to the 1-based position of their label in the
 * corresponding table below, and to 0.0 for any unlisted label. Numeric
 * fields are parsed as doubles. A NULL field is treated as an unlisted
 * label / unparsable number and yields 0.0.
 */
void TransformCategoricalInputs(char* input[], double output[])
{
    static const char *const kField0[]  = { "A11", "A12", "A13", "A14" };
    static const char *const kField2[]  = { "A30", "A31", "A32", "A33", "A34" };
    static const char *const kField5[]  = { "A61", "A62", "A63", "A64", "A65" };
    static const char *const kField8[]  = { "A91", "A92", "A93", "A94" };
    static const char *const kField9[]  = { "A101", "A102", "A103" };
    static const char *const kField11[] = { "A121", "A122", "A123", "A124" };
    static const char *const kField18[] = { "A191", "A192" };

    output[0]  = gepCategoryCode(input[0], kField0, GEP_COUNT_OF(kField0));
    output[1]  = gepNumericValue(input[1]);
    output[2]  = gepCategoryCode(input[2], kField2, GEP_COUNT_OF(kField2));
    output[5]  = gepCategoryCode(input[5], kField5, GEP_COUNT_OF(kField5));
    output[7]  = gepNumericValue(input[7]);
    output[8]  = gepCategoryCode(input[8], kField8, GEP_COUNT_OF(kField8));
    output[9]  = gepCategoryCode(input[9], kField9, GEP_COUNT_OF(kField9));
    output[10] = gepNumericValue(input[10]);
    output[11] = gepCategoryCode(input[11], kField11, GEP_COUNT_OF(kField11));
    output[12] = gepNumericValue(input[12]);
    output[15] = gepNumericValue(input[15]);
    output[18] = gepCategoryCode(input[18], kField18, GEP_COUNT_OF(kField18));
}
See Also:
 
 
 Related Tutorials:
 
 
 Related Videos:
 
 
 
 |