1

I am trying to implement the Random Forest algorithm in Python using the script from this article https://machinelearningmastery.com/implement-random-forest-scratch-python/ and modifying it for my dataset, but I get the following error when I run my code:

Traceback (most recent call last):
  File "C:----\scratch.py", line 211, in <module>
    str_column_to_float(dataset, i)
  File "C:----\scratch.py", line 31, in str_column_to_float
    row[column] = float(row[column].strip())
ValueError: could not convert string to float: male

Is there any good way to fix that?

I tried to convert the value male in my Gender attribute to a numeric value in this part of the code:

def replace_non_numeric(df):
    """Encode the "Gender" column as integers.

    "male" becomes 0 and every other value becomes 1. The column is replaced
    on the DataFrame that was passed in, and that same DataFrame is returned.
    """
    df["Gender"] = df["Gender"].apply(lambda gender: 0 if gender == "male" else 1)
    return df

# Read the CSV with pandas and encode its "Gender" column as 0/1.
train_df = replace_non_numeric(pd.read_csv("datatrain.csv"))

but the error still occurs

this is my dataset

Id  Age Gender  Race           Result

50  15  male    Bi-Racial           1                                                      

51  14  female  African-American    1

52  16  male    African-American    0

53  18  male    African-American    0

54  19  male    African-American    1

55  16  male    Caucasian           1

56  15  female  African-American    1

57  15  male    African-American    1

and here is the entire code

import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score
from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Load a CSV file
def load_csv(datatrain):
    """Read a CSV file into a list of rows, skipping the header line.

    Args:
        datatrain: Path of the CSV file to read.

    Returns:
        A list holding one list of string fields per non-empty data row, in
        file order. The first line is treated as a header and discarded. An
        empty list is returned when the file has no data rows.
    """
    with open(datatrain, 'r') as fr:
        csv_reader = reader(fr)
        # Discard the header; the None default keeps an empty file from
        # raising StopIteration.
        next(csv_reader, None)
        # Collect every remaining row (the reader yields [] for blank lines,
        # which are dropped) and return only once the whole file is consumed.
        return [row for row in csv_reader if row]

def replace_non_numeric(df):
    """Encode the "Gender" column as integers and return the same DataFrame.

    "male" maps to 0; any other value maps to 1. The column is overwritten
    on the DataFrame that was passed in.
    """
    def encode(gender):
        if gender == "male":
            return 0
        return 1

    df["Gender"] = df["Gender"].apply(encode)
    return df

# Load the training CSV with pandas and encode its "Gender" column as 0/1.
# NOTE(review): train_df is not referenced again in the code shown here; the
# random forest script further down reloads 'datatrain.csv' through load_csv(),
# so this encoding does not reach the rows that str_column_to_float() converts.
train_df = replace_non_numeric(pd.read_csv("datatrain.csv"))

# Convert string column to float
def str_column_to_float(dataset, column):
    """Replace, in place, the string at `column` of every row with its float value.

    Surrounding whitespace is stripped before parsing. Nothing is returned;
    float() raises ValueError for text that is not a number.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

# Convert string column to integer
def str_column_to_int(dataset, column):
    """Replace, in place, each value at `column` with an integer code.

    Every distinct value in the column gets one code from 0 upwards, assigned
    in the iteration order of a set of those values.

    Returns:
        The dict mapping each original value to its integer code.
    """
    distinct = set([record[column] for record in dataset])
    lookup = {value: code for code, value in enumerate(distinct)}
    for record in dataset:
        record[column] = lookup[record[column]]
    return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition rows of `dataset` into `n_folds` equally sized folds.

    Each fold holds int(len(dataset) / n_folds) rows drawn without replacement
    from a copy of the dataset, so the input list itself is left untouched and
    any remainder rows are not placed in a fold.

    Returns:
        A list of `n_folds` lists of rows.
    """
    remaining = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Pop a random remaining row fold_size times to fill this fold.
        fold = [remaining.pop(randrange(len(remaining))) for _ in range(fold_size)]
        folds.append(fold)
    return folds

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where `predicted` equals `actual`.

    The positions compared are the indices of `actual`; the result is a float
    between 0.0 and 100.0.
    """
    hits = sum(1 for i in range(len(actual)) if actual[i] == predicted[i])
    return hits / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score `algorithm` with k-fold cross validation.

    The dataset is split with cross_validation_split(). Each fold in turn is
    held out as the test set while the rows of the other folds form the
    training set. Test rows are copied and their last element (the class) is
    set to None before being handed to the algorithm.

    Args:
        dataset: List of rows whose last element is the class value.
        algorithm: Callable invoked as algorithm(train_set, test_set, *args)
            that returns one prediction per test row.
        n_folds: Number of folds.
        *args: Extra positional arguments forwarded to `algorithm`.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        other_folds = list(folds)
        other_folds.remove(held_out)
        # Flatten the remaining folds into a single list of training rows.
        train_set = [row for fold in other_folds for row in fold]
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores

# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    """Return the size-weighted Gini index of a candidate split.

    Args:
        groups: The groups of rows produced by a split; the last element of
            each row is its class value.
        classes: The class values to score against.

    Returns:
        The sum over non-empty groups of (1 - sum of squared class
        proportions), each weighted by the group's share of all rows.
    """
    total_rows = float(sum(len(group) for group in groups))
    weighted = 0.0
    for group in groups:
        row_count = len(group)
        # An empty group contributes nothing and would divide by zero below.
        if row_count == 0:
            continue
        size = float(row_count)
        labels = [row[-1] for row in group]
        purity = 0.0
        for class_val in classes:
            proportion = labels.count(class_val) / size
            purity += proportion * proportion
        weighted += (1.0 - purity) * (size / total_rows)
    return weighted

# Select the best split point for a dataset
def get_split(dataset, n_features):
    """Pick the best split of `dataset` over a random subset of feature columns.

    `n_features` distinct column indices are drawn at random from every column
    except the last (the class). Each row's value in each chosen column is
    tried as a split point, and the one with the lowest Gini index wins.

    Returns:
        A dict with keys 'index' (column), 'value' (split point) and 'groups'
        (the left/right row groups produced by that split).
    """
    class_values = list(set(row[-1] for row in dataset))
    best_index, best_value, best_score, best_groups = 999, 999, 999, None
    chosen = []
    while len(chosen) < n_features:
        candidate = randrange(len(dataset[0]) - 1)
        if candidate in chosen:
            continue
        chosen.append(candidate)
    for feature in chosen:
        for row in dataset:
            threshold = row[feature]
            groups = test_split(feature, threshold, dataset)
            score = gini_index(groups, class_values)
            if score < best_score:
                best_index, best_value = feature, threshold
                best_score, best_groups = score, groups
    return {'index': best_index, 'value': best_value, 'groups': best_groups}

# Create a terminal node value
def to_terminal(group):
    """Return the most common class value (last element) among the rows of `group`."""
    labels = [row[-1] for row in group]
    candidates = set(labels)
    return max(candidates, key=labels.count)

# Create child splits for a node or make terminal
def split(node, max_depth, min_size, n_features, depth):
    """Grow the tree below `node` in place, recursing until a stop rule applies.

    `node` must carry a 'groups' entry (a left/right pair of row lists); the
    entry is removed and 'left'/'right' entries are added. Each child is either
    a terminal class value or a further node dict that is split recursively.

    Args:
        node: Node dict as produced by get_split().
        max_depth: Depth at which both children become terminals.
        min_size: A child with this many rows or fewer becomes a terminal.
        n_features: Number of features passed to get_split() for child nodes.
        depth: Depth of `node`.
    """
    left, right = node['groups']
    node.pop('groups')
    # No real split: one side is empty, so both children share one terminal.
    if not (left and right):
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # Depth limit reached: both children become terminals.
    if depth >= max_depth:
        left_leaf = to_terminal(left)
        right_leaf = to_terminal(right)
        node['left'] = left_leaf
        node['right'] = right_leaf
        return
    # Left child is handled before the right child.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
            continue
        node[side] = get_split(rows, n_features)
        split(node[side], max_depth, min_size, n_features, depth + 1)

# Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    """Build a decision tree from the rows in `train` and return its root node.

    The root comes from get_split() and is then expanded by split(), starting
    at depth 1.
    """
    tree = get_split(train, n_features)
    split(tree, max_depth, min_size, n_features, depth=1)
    return tree

# Make a prediction with a decision tree
def predict(node, row):
    """Walk the tree from `node` and return the terminal value reached for `row`.

    At each node the row's value at node['index'] is compared with
    node['value']: smaller goes left, otherwise right. A child that is a dict
    is another node; anything else is the prediction.
    """
    while True:
        branch = 'left' if row[node['index']] < node['value'] else 'right'
        child = node[branch]
        if not isinstance(child, dict):
            return child
        node = child

# Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    """Draw a random sample of rows from `dataset` with replacement.

    The sample holds round(len(dataset) * ratio) rows, each picked
    independently, so the same row may appear more than once.
    """
    target = round(len(dataset) * ratio)
    picked = []
    while len(picked) < target:
        picked.append(dataset[randrange(len(dataset))])
    return picked

# Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    """Return the prediction made most often for `row` across all `trees`."""
    votes = []
    for tree in trees:
        votes.append(predict(tree, row))
    return max(set(votes), key=votes.count)

# Random Forest Algorithm
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    """Train `n_trees` trees on random subsamples of `train` and predict `test`.

    Each tree is built by build_tree() from its own subsample() of the
    training rows, taken with ratio `sample_size`.

    Returns:
        A list with one bagged (majority-vote) prediction per row of `test`.
    """
    forest = [
        build_tree(subsample(train, sample_size), max_depth, min_size, n_features)
        for _ in range(n_trees)
    ]
    return [bagging_predict(forest, row) for row in test]

# Test the random forest algorithm
seed(2)
# load and prepare data
filename = 'datatrain.csv'
dataset = load_csv(filename)
# Convert every feature column (all columns but the last) to numbers.
# Numeric text becomes float. A column containing any non-numeric text
# (e.g. 'male') would make float() raise ValueError, so such a column is
# integer-encoded instead.
# NOTE(review): every column except the last is used as a feature, which for
# the dataset shown in the question includes the Id column - confirm that is intended.
for i in range(0, len(dataset[0])-1):
    try:
        # Dry run: parse each value without modifying the rows, so a failure
        # part-way down the column cannot leave it half converted.
        for row in dataset:
            float(row[i].strip())
    except ValueError:
        # Non-numeric column: give each distinct string its own integer code.
        str_column_to_int(dataset, i)
    else:
        str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
# evaluate algorithm
n_folds = 5
max_depth = 10
min_size = 1
sample_size = 1.0
# number of features tried per split: square root of the feature count
n_features = int(sqrt(len(dataset[0])-1))
for n_trees in [1, 5, 10]:
    scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees, n_features)
    print('Trees: %d' % n_trees)
    print('Scores: %s' % scores)
    print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

I am trying to build a model that shows how likely a person (Id) is to have a Result of 0 or 1 based on their demographics. Please point out anything I am doing wrong, or tell me whether I should print something different to get more useful output.

2 Answers 2

1

Calling df["Gender"] won't work because the fields in your CSV file are separated by spaces, which you do not specify in train_df = replace_non_numeric(pd.read_csv("datatrain.csv")). By default, read_csv assumes that , is used as the separator.

If you want to use a variable amount of whitespace as the separator, you should use the regular expression \s+. Here is the corresponding code:

def replace_non_numeric(df):
    """Encode the "Gender" column as 0 ("male") or 1 (anything else).

    The DataFrame is printed before and after the conversion so the effect is
    visible, and the same DataFrame object is returned.
    """
    def encode(gender):
        if gender == "male":
            return 0
        return 1

    print(df)
    df["Gender"] = df["Gender"].apply(encode)
    print(df)
    return df

# Read the file with runs of whitespace as the field separator. The pattern is
# a raw string so that \s reaches pandas as a regex and is not treated as an
# (invalid) string escape sequence by Python.
train_df = replace_non_numeric(pd.read_csv("datatrain.csv", sep=r"\s+"))

This will return:

   Id  Age  Gender              Race  Result
0  50   15    male         Bi-Racial       1
1  51   14  female  African-American       1
2  52   16    male  African-American       0
3  53   18    male  African-American       0
4  54   19    male  African-American       1
5  55   16    male         Caucasian       1
6  56   15  female  African-American       1
7  57   15    male  African-American       1

   Id  Age  Gender              Race  Result
0  50   15       0         Bi-Racial       1
1  51   14       1  African-American       1
2  52   16       0  African-American       0
3  53   18       0  African-American       0
4  54   19       0  African-American       1
5  55   16       0         Caucasian       1
6  56   15       1  African-American       1
7  57   15       0  African-American       1
Sign up to request clarification or add additional context in comments.

Comments

0

I used the following in my RFC script: df_ilpd.Gender[df_ilpd.Gender == 'male'] = 1. This changed 'male' to '1' in my selected DataFrame.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.