SageMaker: Analysis and model training on the Iris dataset¶

Download and read the dataset¶

import urllib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Attribute and class descriptions: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names
# Raw data: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

download_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
file_name = "iris.data"
# Fetch the dataset into the current working directory under file_name
urllib.request.urlretrieve(download_url, file_name)

('iris.data', <http.client.HTTPMessage at 0x7f7ac95413c8>)

# Load the downloaded CSV into a pandas dataframe, supplying the column names explicitly
column_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df = pd.read_csv('./{}'.format(file_name), names=column_names)

Inspect the data and perform some simple analysis¶

# Preview the first few observations to sanity-check the parsed columns
df.head()

# Number of non-null observations in each column
df.count()

sepal_length    150
sepal_width     150
petal_length    150
petal_width     150
species         150
dtype: int64

# Distinct class labels (species) present in the data
df['species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

# Summary of the dataframe: column names, non-null counts, data types and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

# Summary statistics over the whole dataset (all species pooled together) - useful or not?
df.describe()

# Possibly more useful - mean value of each measurement, grouped by species
df.groupby('species').mean()

# Distribution of values for each attribute, drawn as one subplot per measurement
measurement_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
plt.figure(1, figsize=(15, 6))
# enumerate(start=1) supplies the 1-based subplot position directly
for position, col in enumerate(measurement_cols, start=1):
    plt.subplot(1, len(measurement_cols), position)
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is kept
    # here to match the installed version - confirm before upgrading seaborn.
    sns.distplot(df[col], bins=20)
    plt.title('Distplot of {}'.format(col))
plt.show()

Look for relationships and correlation¶

# Correlation heatmap - petal length and width look to be most associated
plt.figure(figsize=(7, 4))
# Select the numeric columns explicitly: the string-valued 'species' column cannot be
# correlated, and newer pandas versions raise an error instead of silently dropping it.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True)
plt.show()

# Pairwise scatter plots coloured by species - an alternative view that also shows
# the strong association between petal length and width
g = sns.pairplot(data=df, hue='species', markers='+')
plt.show()

Classify using the scikit-learn K-Nearest Neighbor algorithm¶

Prepare the training and test data¶

# Create training and test dataframes based on a random 70/30 split.
# Shuffle every row, then slice by position. This is equivalent to np.split at the
# same index but avoids calling np.split on a DataFrame, which relies on behaviour
# that newer pandas versions deprecate.
shuffled_df = df.sample(frac=1, random_state=np.random.RandomState())
split_at = int(0.7 * len(df))
train_data, test_data = shuffled_df.iloc[:split_at], shuffled_df.iloc[split_at:]

Create a knn model using the training data¶

from sklearn.neighbors import KNeighborsClassifier

# Declare knn classifier; classify based on most common classification of 3 nearest neighbours
knn = KNeighborsClassifier(n_neighbors=3)

# Train knn model on the four measurement columns, with species as the label
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
train_features = train_data[feature_cols]
train_labels = train_data["species"]
knn.fit(train_features, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

Predict species using the model and the test data¶

# Run the trained model over the four measurement columns of the test data
test_features = test_data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
preds_array = knn.predict(test_features)

# Wrap the predictions in a dataframe with a single column called prediction
preds_df = pd.DataFrame(preds_array, columns=['prediction'])

# Reset the test index so it lines up row-for-row with preds_df, then place the two side by side
test_reset = test_data.reset_index(drop=True)
combined_df = test_reset.join(preds_df)

Review the predictions¶

# First 20 test rows - predictions look good compared to the known class
combined_df.head(20)

# Cross-tabulate actual against predicted species - prediction is largely correct
pd.crosstab(index=combined_df['species'],
            columns=combined_df['prediction'],
            rownames=['Actual species'],
            colnames=['Predicted species'])

# Visually, the predictions show what we would expect when compared with the pairplot of the dataset above
sns.pairplot(data=combined_df, hue='prediction', markers='+')
plt.show()

Classify using the SageMaker k-nearest neighbour algorithm¶

Prepare the training and test data¶

# Reload CSV data into a pandas dataframe
df = pd.read_csv('./{}'.format(file_name), names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'])

# Remap species names to integer class labels. Series.map is used instead of replace():
# every value is expected to be one of these three names, and map avoids the
# object-column downcasting that newer pandas versions deprecate in replace().
species_to_label = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
df['species'] = df['species'].map(species_to_label)

# Move species column to first position (SageMaker built-in algorithms read the
# label from the first column of headerless CSV training data)
df = pd.concat([df['species'], df.drop(['species'], axis=1)], axis=1)

# Create training and test dataframes based on a random 70/30 split.
# Shuffle every row, then slice by position; this avoids calling np.split on a
# DataFrame, which relies on behaviour that newer pandas versions deprecate.
shuffled_df = df.sample(frac=1, random_state=np.random.RandomState())
split_at = int(0.7 * len(df))
train_data, test_data = shuffled_df.iloc[:split_at], shuffled_df.iloc[split_at:]

Create a knn model using the training data¶

import boto3
from datetime import datetime
import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker.amazon.amazon_estimator import get_image_uri

# S3 config
bucket = 'iris-2020'
train_fname = 'iris-train.csv'
test_fname = 'iris-test.csv'
output_path = 's3://{}/output'.format(bucket)

# Each entry: (S3 key prefix, dataframe, local file name)
splits = (('train', train_data, train_fname),
          ('test', test_data, test_fname))

# Save training and test data to local notebook instance (without indexes and headers)
for _, split_df, local_fname in splits:
    split_df.to_csv(local_fname, index=False, header=False)

# Save training and test data to S3, each under its own key prefix
for prefix, _, local_fname in splits:
    s3_key = "{}/{}".format(prefix, local_fname)
    boto3.Session().resource('s3').Bucket(bucket).Object(s3_key).upload_file(local_fname)

# Training config: the job name carries a timestamp of when this cell ran
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
job_name = 'iris-job-{}'.format(timestamp)

# Declare knn estimator from the "knn" algorithm image for the session's region
region = boto3.Session().region_name
knn_image = get_image_uri(region, "knn")
execution_role = get_execution_role()
knn = sagemaker.estimator.Estimator(knn_image,
                                    execution_role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path=output_path,
                                    sagemaker_session=sagemaker.Session())

# Set mandatory hyperparameters; classify based on most common classification of 3 nearest neighbours
hyperparameters = dict(predictor_type='classifier',
                       feature_dim=4,
                       k=3,
                       sample_size=len(train_data))
knn.set_hyperparameters(**hyperparameters)

# Define the data type and the S3 prefixes holding the training and test data
content_type = "text/csv"
train_s3_uri = "s3://{}/{}/".format(bucket, 'train')
test_s3_uri = "s3://{}/{}/".format(bucket, 'test')
train_input = sagemaker.session.s3_input(s3_data=train_s3_uri, content_type=content_type)
test_input = sagemaker.session.s3_input(s3_data=test_s3_uri, content_type=content_type)

# Train the knn model with just training data
channels = {'train': train_input}
knn.fit(channels, job_name=job_name)

# Train the knn model with training and validation data
# knn.fit({'train': train_input, 'test': test_input},  job_name=job_name)

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.

2020-11-19 13:11:57 Starting - Starting the training job...
2020-11-19 13:12:00 Starting - Launching requested ML instances......
2020-11-19 13:13:15 Starting - Preparing the instances for training......
2020-11-19 13:14:23 Downloading - Downloading input data......
2020-11-19 13:15:14 Training - Downloading the training image..Docker entrypoint called with argument(s): train
Running default environment configuration script
[11/19/2020 13:15:38 INFO 139720084928320] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'index_metric': u'L2', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'feature_dim': u'auto', u'faiss_index_ivf_nlists': u'auto', u'epochs': u'1', u'index_type': u'faiss.Flat', u'_faiss_index_nprobe': u'5', u'_kvstore': u'dist_async', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000'}
[11/19/2020 13:15:38 INFO 139720084928320] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {u'sample_size': u'105', u'feature_dim': u'4', u'predictor_type': u'classifier', u'k': u'3'}
[11/19/2020 13:15:38 INFO 139720084928320] Final configuration: {u'index_metric': u'L2', u'predictor_type': u'classifier', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'_log_level': u'info', u'feature_dim': u'4', u'faiss_index_ivf_nlists': u'auto', u'sample_size': u'105', u'epochs': u'1', u'index_type': u'faiss.Flat', u'_faiss_index_nprobe': u'5', u'_kvstore': u'dist_async', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000', u'k': u'3'}
[11/19/2020 13:15:38 WARNING 139720084928320] Loggers have already been setup.
[11/19/2020 13:15:38 INFO 139720084928320] Launching parameter server for role scheduler
[11/19/2020 13:15:38 INFO 139720084928320] {'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'SAGEMAKER_HTTP_PORT': '8080', 'HOME': '/root', 'PYTHONUNBUFFERED': 'TRUE', 'CANONICAL_ENVROOT': '/opt/amazon', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python2.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'LANG': 'en_US.utf8', 'DMLC_INTERFACE': 'eth0', 'SHLVL': '1', 'AWS_REGION': 'us-east-1', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'NVIDIA_VISIBLE_DEVICES': 'void', 'TRAINING_JOB_NAME': 'iris-job-20201119131155', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'ENVROOT': '/opt/amazon', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'OMP_NUM_THREADS': '2', 'HOSTNAME': 'ip-10-0-230-142.ec2.internal', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/01d241cf-c2d1-4de1-b9a6-8a96a56664a4', 'PWD': '/', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-east-1:130636039461:training-job/iris-job-20201119131155', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2'}
[11/19/2020 13:15:38 INFO 139720084928320] envs={'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'DMLC_NUM_WORKER': '1', 'DMLC_PS_ROOT_PORT': '9000', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'SAGEMAKER_HTTP_PORT': '8080', 'HOME': '/root', 'PYTHONUNBUFFERED': 'TRUE', 'CANONICAL_ENVROOT': '/opt/amazon', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python2.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'LANG': 'en_US.utf8', 'DMLC_INTERFACE': 'eth0', 'SHLVL': '1', 'DMLC_PS_ROOT_URI': '10.0.230.142', 'AWS_REGION': 'us-east-1', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'NVIDIA_VISIBLE_DEVICES': 'void', 'TRAINING_JOB_NAME': 'iris-job-20201119131155', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'ENVROOT': '/opt/amazon', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'OMP_NUM_THREADS': '2', 'HOSTNAME': 'ip-10-0-230-142.ec2.internal', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/01d241cf-c2d1-4de1-b9a6-8a96a56664a4', 'DMLC_ROLE': 'scheduler', 'PWD': '/', 'DMLC_NUM_SERVER': '1', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-east-1:130636039461:training-job/iris-job-20201119131155', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2'}
[11/19/2020 13:15:38 INFO 139720084928320] Launching parameter server for role server
[11/19/2020 13:15:38 INFO 139720084928320] {'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'SAGEMAKER_HTTP_PORT': '8080', 'HOME': '/root', 'PYTHONUNBUFFERED': 'TRUE', 'CANONICAL_ENVROOT': '/opt/amazon', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python2.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'LANG': 'en_US.utf8', 'DMLC_INTERFACE': 'eth0', 'SHLVL': '1', 'AWS_REGION': 'us-east-1', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'NVIDIA_VISIBLE_DEVICES': 'void', 'TRAINING_JOB_NAME': 'iris-job-20201119131155', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'ENVROOT': '/opt/amazon', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'OMP_NUM_THREADS': '2', 'HOSTNAME': 'ip-10-0-230-142.ec2.internal', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/01d241cf-c2d1-4de1-b9a6-8a96a56664a4', 'PWD': '/', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-east-1:130636039461:training-job/iris-job-20201119131155', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2'}
[11/19/2020 13:15:38 INFO 139720084928320] envs={'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'DMLC_NUM_WORKER': '1', 'DMLC_PS_ROOT_PORT': '9000', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'SAGEMAKER_HTTP_PORT': '8080', 'HOME': '/root', 'PYTHONUNBUFFERED': 'TRUE', 'CANONICAL_ENVROOT': '/opt/amazon', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python2.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'LANG': 'en_US.utf8', 'DMLC_INTERFACE': 'eth0', 'SHLVL': '1', 'DMLC_PS_ROOT_URI': '10.0.230.142', 'AWS_REGION': 'us-east-1', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'NVIDIA_VISIBLE_DEVICES': 'void', 'TRAINING_JOB_NAME': 'iris-job-20201119131155', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'ENVROOT': '/opt/amazon', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'OMP_NUM_THREADS': '2', 'HOSTNAME': 'ip-10-0-230-142.ec2.internal', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/01d241cf-c2d1-4de1-b9a6-8a96a56664a4', 'DMLC_ROLE': 'server', 'PWD': '/', 'DMLC_NUM_SERVER': '1', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-east-1:130636039461:training-job/iris-job-20201119131155', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2'}
[11/19/2020 13:15:38 INFO 139720084928320] Environment: {'ECS_CONTAINER_METADATA_URI': 'http://169.254.170.2/v3/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'ECS_CONTAINER_METADATA_URI_V4': 'http://169.254.170.2/v4/5b076316-bd0e-4444-aa13-a25ae3b4e862', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION': '2', 'DMLC_PS_ROOT_PORT': '9000', 'DMLC_NUM_WORKER': '1', 'SAGEMAKER_HTTP_PORT': '8080', 'PATH': '/opt/amazon/bin:/usr/local/nvidia/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/amazon/bin:/opt/amazon/bin', 'PYTHONUNBUFFERED': 'TRUE', 'CANONICAL_ENVROOT': '/opt/amazon', 'LD_LIBRARY_PATH': '/opt/amazon/lib/python2.7/site-packages/cv2/../../../../lib:/usr/local/nvidia/lib64:/opt/amazon/lib', 'LANG': 'en_US.utf8', 'DMLC_INTERFACE': 'eth0', 'SHLVL': '1', 'DMLC_PS_ROOT_URI': '10.0.230.142', 'AWS_REGION': 'us-east-1', 'SAGEMAKER_METRICS_DIRECTORY': '/opt/ml/output/metrics/sagemaker', 'NVIDIA_VISIBLE_DEVICES': 'void', 'TRAINING_JOB_NAME': 'iris-job-20201119131155', 'HOME': '/root', 'PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION': 'cpp', 'ENVROOT': '/opt/amazon', 'SAGEMAKER_DATA_PATH': '/opt/ml', 'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility', 'NVIDIA_REQUIRE_CUDA': 'cuda>=9.0', 'OMP_NUM_THREADS': '2', 'HOSTNAME': 'ip-10-0-230-142.ec2.internal', 'AWS_CONTAINER_CREDENTIALS_RELATIVE_URI': '/v2/credentials/01d241cf-c2d1-4de1-b9a6-8a96a56664a4', 'DMLC_ROLE': 'worker', 'PWD': '/', 'DMLC_NUM_SERVER': '1', 'TRAINING_JOB_ARN': 'arn:aws:sagemaker:us-east-1:130636039461:training-job/iris-job-20201119131155', 'AWS_EXECUTION_ENV': 'AWS_ECS_EC2'}
Process 60 is a shell:scheduler.
Process 69 is a shell:server.
Process 1 is a worker.
[11/19/2020 13:15:38 INFO 139720084928320] Using default worker.
[11/19/2020 13:15:39 INFO 139720084928320] Checkpoint loading and saving are disabled.
[11/19/2020 13:15:39 INFO 139720084928320] nvidia-smi took: 0.0252599716187 secs to identify 0 gpus
[11/19/2020 13:15:39 INFO 139720084928320] Create Store: dist_async
[11/19/2020 13:15:39 ERROR 139720084928320] nvidia-smi: failed to run (127): /bin/sh: nvidia-smi: command not found
[11/19/2020 13:15:39 INFO 139720084928320] Using per-worker sample size = 105 (Available virtual memory = 15140818944 bytes, GPU free memory = 0 bytes, number of workers = 1). If an out-of-memory error occurs, choose a larger instance type, use dimension reduction, decrease sample_size, and/or decrease mini_batch_size.
#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 0, "sum": 0.0, "min": 0}, "Number of Batches Since Last Reset": {"count": 1, "max": 0, "sum": 0.0, "min": 0}, "Number of Records Since Last Reset": {"count": 1, "max": 0, "sum": 0.0, "min": 0}, "Total Batches Seen": {"count": 1, "max": 0, "sum": 0.0, "min": 0}, "Total Records Seen": {"count": 1, "max": 0, "sum": 0.0, "min": 0}, "Max Records Seen Between Resets": {"count": 1, "max": 0, "sum": 0.0, "min": 0}, "Reset Count": {"count": 1, "max": 0, "sum": 0.0, "min": 0}}, "EndTime": 1605791740.078393, "Dimensions": {"Host": "algo-1", "Meta": "init_train_data_iter", "Operation": "training", "Algorithm": "AWS/KNN"}, "StartTime": 1605791740.078323}

[11/19/2020 13:15:40 INFO 139720084928320] push reservoir to kv... 1 num_workers 0 rank
[11/19/2020 13:15:40 INFO 139720084928320] ...done (105)
[11/19/2020 13:15:40 INFO 139720084928320] #progress_metric: host=algo-1, completed 100 % of epochs
#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 1, "sum": 1.0, "min": 1}, "Number of Batches Since Last Reset": {"count": 1, "max": 1, "sum": 1.0, "min": 1}, "Number of Records Since Last Reset": {"count": 1, "max": 105, "sum": 105.0, "min": 105}, "Total Batches Seen": {"count": 1, "max": 1, "sum": 1.0, "min": 1}, "Total Records Seen": {"count": 1, "max": 105, "sum": 105.0, "min": 105}, "Max Records Seen Between Resets": {"count": 1, "max": 105, "sum": 105.0, "min": 105}, "Reset Count": {"count": 1, "max": 1, "sum": 1.0, "min": 1}}, "EndTime": 1605791740.103201, "Dimensions": {"Host": "algo-1", "Meta": "training_data_iter", "Operation": "training", "Algorithm": "AWS/KNN", "epoch": 0}, "StartTime": 1605791740.07875}

[11/19/2020 13:15:40 INFO 139720084928320] #throughput_metric: host=algo-1, train throughput=4267.74994428 records/second
[11/19/2020 13:15:40 INFO 139720084928320] pulled row count... worker 0 rows 105
[11/19/2020 13:15:40 INFO 139720084928320] pulled... worker 0 data (105, 4) labels (105,) nans 0
[11/19/2020 13:15:40 INFO 139720084928320] calling index.train...
[11/19/2020 13:15:40 INFO 139720084928320] ...done calling index.train
[11/19/2020 13:15:40 INFO 139720084928320] calling index.add...
[11/19/2020 13:15:40 INFO 139720084928320] ...done calling index.add
#metrics {"Metrics": {"epochs": {"count": 1, "max": 1, "sum": 1.0, "min": 1}, "model.serialize.time": {"count": 1, "max": 2.8769969940185547, "sum": 2.8769969940185547, "min": 2.8769969940185547}, "finalize.time": {"count": 1, "max": 4.1141510009765625, "sum": 4.1141510009765625, "min": 4.1141510009765625}, "initialize.time": {"count": 1, "max": 866.8332099914551, "sum": 866.8332099914551, "min": 866.8332099914551}, "update.time": {"count": 1, "max": 24.17898178100586, "sum": 24.17898178100586, "min": 24.17898178100586}}, "EndTime": 1605791740.110562, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/KNN"}, "StartTime": 1605791739.211127}

[11/19/2020 13:15:40 INFO 139720084928320] Test data is not provided.
#metrics {"Metrics": {"totaltime": {"count": 1, "max": 1694.9079036712646, "sum": 1694.9079036712646, "min": 1694.9079036712646}, "setuptime": {"count": 1, "max": 37.4448299407959, "sum": 37.4448299407959, "min": 37.4448299407959}}, "EndTime": 1605791740.111786, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "AWS/KNN"}, "StartTime": 1605791740.110652}


2020-11-19 13:15:51 Uploading - Uploading generated training model
2020-11-19 13:15:51 Completed - Training job completed
Training seconds: 88
Billable seconds: 88

Deploy the model on a SageMaker endpoint¶

# Deploy the model to a SageMaker endpoint backed by a single ml.m4.xlarge instance
knn_predictor = knn.deploy(instance_type='ml.m4.xlarge', initial_instance_count=1)

# Use the CSV serializer for requests and the JSON deserializer for responses
knn_predictor.serializer = csv_serializer
knn_predictor.deserializer = json_deserializer

-------------------!

Predict species using the model and our test data¶

# Convert the test dataframe into an array and drop the species column on the fly
test_data_array = test_data.drop(['species'], axis=1).values

# Predict using the model and the test data
preds = knn_predictor.predict(test_data_array)

# Predict using the model and one set of flower values
# preds = knn_predictor.predict([4.8, 3.0, 1.4, 0.1])

# Convert JSON predictions to an array by pulling the predicted label out of each entry
preds_array = np.array([pred['predicted_label'] for pred in preds['predictions']])

# Convert the array to a dataframe with a single column called prediction
preds_df = pd.DataFrame(preds_array, columns=['prediction'])

# Add the prediction dataframe to the test dataframe by simply placing side by side
combined_df = test_data.reset_index(drop=True).join(preds_df)

# Remap species integers back to original values; the same mapping applies to the
# known species and to the predicted label
label_to_species = {0: 'Iris-setosa', 1: 'Iris-versicolor', 2: 'Iris-virginica'}
for label_col in ('species', 'prediction'):
    combined_df[label_col] = combined_df[label_col].replace(label_to_species)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.

Review the predictions¶

# First 20 test rows - as with the previous model above, predictions look good compared to the known class
combined_df.head(20)

# Cross-tabulate actual against predicted species - prediction is largely correct
pd.crosstab(index=combined_df['species'],
            columns=combined_df['prediction'],
            rownames=['Actual species'],
            colnames=['Predicted species'])

# Visually, the predictions show what we would expect when compared with the pairplot of the dataset higher up above
sns.pairplot(data=combined_df, hue='prediction', markers='+')
plt.show()

Delete the endpoint (to avoid running up a big bill)¶

# Read the endpoint name from the predictor, then delete that endpoint
endpoint_name = knn_predictor.endpoint
sagemaker.Session().delete_endpoint(endpoint_name)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.

	sepal_length	sepal_width	petal_length	petal_width
count	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.054000	3.758667	1.198667
std	0.828066	0.433594	1.764420	0.763161
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.600000	0.300000
50%	5.800000	3.000000	4.350000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000

	sepal_length	sepal_width	petal_length	petal_width
species
Iris-setosa	5.006	3.418	1.464	0.244
Iris-versicolor	5.936	2.770	4.260	1.326
Iris-virginica	6.588	2.974	5.552	2.026

Predicted species	Iris-setosa	Iris-versicolor	Iris-virginica
Actual species
Iris-setosa	17	0	0
Iris-versicolor	0	15	0
Iris-virginica	0	1	12

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	Iris-setosa
1	4.9	3.0	1.4	0.2	Iris-setosa
2	4.7	3.2	1.3	0.2	Iris-setosa
3	4.6	3.1	1.5	0.2	Iris-setosa
4	5.0	3.6	1.4	0.2	Iris-setosa

	sepal_length	sepal_width	petal_length	petal_width	species	prediction
0	5.7	3.0	4.2	1.2	Iris-versicolor	Iris-versicolor
1	4.9	3.1	1.5	0.1	Iris-setosa	Iris-setosa
2	4.7	3.2	1.3	0.2	Iris-setosa	Iris-setosa
3	4.6	3.4	1.4	0.3	Iris-setosa	Iris-setosa
4	6.4	2.8	5.6	2.1	Iris-virginica	Iris-virginica
5	4.8	3.4	1.9	0.2	Iris-setosa	Iris-setosa
6	5.7	2.9	4.2	1.3	Iris-versicolor	Iris-versicolor
7	5.4	3.9	1.3	0.4	Iris-setosa	Iris-setosa
8	5.0	3.4	1.6	0.4	Iris-setosa	Iris-setosa
9	6.6	3.0	4.4	1.4	Iris-versicolor	Iris-versicolor
10	7.2	3.0	5.8	1.6	Iris-virginica	Iris-virginica
11	5.4	3.4	1.7	0.2	Iris-setosa	Iris-setosa
12	5.4	3.7	1.5	0.2	Iris-setosa	Iris-setosa
13	6.5	2.8	4.6	1.5	Iris-versicolor	Iris-versicolor
14	6.4	3.2	4.5	1.5	Iris-versicolor	Iris-versicolor
15	5.1	3.8	1.9	0.4	Iris-setosa	Iris-setosa
16	7.3	2.9	6.3	1.8	Iris-virginica	Iris-virginica
17	6.4	2.9	4.3	1.3	Iris-versicolor	Iris-versicolor
18	6.2	2.9	4.3	1.3	Iris-versicolor	Iris-versicolor
19	5.1	3.5	1.4	0.3	Iris-setosa	Iris-setosa

	species	sepal_length	sepal_width	petal_length	petal_width	prediction
0	Iris-setosa	5.0	3.5	1.3	0.3	Iris-setosa
1	Iris-virginica	6.4	2.7	5.3	1.9	Iris-virginica
2	Iris-versicolor	6.0	2.9	4.5	1.5	Iris-versicolor
3	Iris-setosa	5.0	3.2	1.2	0.2	Iris-setosa
4	Iris-versicolor	5.6	2.7	4.2	1.3	Iris-versicolor
5	Iris-setosa	5.0	3.4	1.5	0.2	Iris-setosa
6	Iris-setosa	5.3	3.7	1.5	0.2	Iris-setosa
7	Iris-setosa	5.8	4.0	1.2	0.2	Iris-setosa
8	Iris-virginica	6.3	2.7	4.9	1.8	Iris-virginica
9	Iris-virginica	6.4	3.1	5.5	1.8	Iris-virginica
10	Iris-versicolor	5.0	2.0	3.5	1.0	Iris-versicolor
11	Iris-virginica	7.4	2.8	6.1	1.9	Iris-virginica
12	Iris-versicolor	6.1	2.9	4.7	1.4	Iris-versicolor
13	Iris-setosa	5.0	3.4	1.6	0.4	Iris-setosa
14	Iris-virginica	6.1	3.0	4.9	1.8	Iris-virginica
15	Iris-setosa	5.4	3.9	1.7	0.4	Iris-setosa
16	Iris-virginica	6.2	2.8	4.8	1.8	Iris-virginica
17	Iris-virginica	6.7	2.5	5.8	1.8	Iris-virginica
18	Iris-versicolor	6.2	2.2	4.5	1.5	Iris-versicolor
19	Iris-versicolor	5.9	3.2	4.8	1.8	Iris-versicolor