classify module¶
Classification module.
Gaussian_Naive_Bayes
¶
A class to implement Gaussian Naive Bayes classifier for training, tuning, and classification tasks.
Source code in geonate/classify.py
class Gaussian_Naive_Bayes:
    """
    A class to implement a Gaussian Naive Bayes classifier for training,
    hyperparameter tuning, and raster-image classification tasks.

    Attributes:
        X_train (array-like): Training feature data.
        y_train (array-like): Training target data.
        X_test (array-like): Testing feature data.
        y_test (array-like): Testing target data.
        initial_gnb (GaussianNB): The initial (untuned) model, fitted on init.
        tuned_gnb: The fitted search object set by ``tune()``, or None.
    """

    def __init__(self, X_train, y_train, X_test, y_test):
        """
        Initialize the Gaussian_Naive_Bayes class with training and testing data.

        Args:
            X_train (array-like): Training feature data.
            y_train (array-like): Training target data.
            X_test (array-like): Testing feature data.
            y_test (array-like): Testing target data.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.initial_gnb = None
        self.tuned_gnb = None
        # Automatically train and validate the initial model.
        self.model()

    # Initial model and validation
    def model(self, **kwargs):
        """
        Train and validate the initial Gaussian Naive Bayes model.

        Args:
            **kwargs: Additional keyword arguments for GaussianNB.

        Returns:
            GaussianNB: The trained Gaussian Naive Bayes model.
        """
        from sklearn.naive_bayes import GaussianNB
        from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

        # Initialize and fit the model on the training split.
        gnb = GaussianNB(**kwargs)
        gnb.fit(self.X_train, self.y_train)
        # Validate on the held-out test split and store validation metrics.
        y_pred = gnb.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, y_pred)
        self.confusion_matrix = confusion_matrix(self.y_test, y_pred)
        # Row-normalized confusion matrix, expressed as percentages.
        self.confusion_matrix_percent = (
            self.confusion_matrix.astype(float)
            / self.confusion_matrix.sum(axis=1, keepdims=True) * 100
        )
        self.classification_report = classification_report(self.y_test, y_pred)
        self.initial_gnb = gnb
        return self.initial_gnb

    # Tune the best parameters for the classifier using random or grid search
    def tune(self, method="random", var_smoothing=None, n_iter=5, cv=5, n_job=-1):
        """
        Tune the Gaussian Naive Bayes model using random search or grid search.

        Args:
            method (str): The tuning method to use ('random' or 'grid').
            var_smoothing (list, optional): var_smoothing values to try.
                Defaults to a log-spaced grid from 1e-15 to 1.
            n_iter (int): Number of iterations for random search.
            cv (int): Number of cross-validation folds.
            n_job (int): Number of jobs to run in parallel (-1 = all processors).

        Returns:
            The fitted search object wrapping the tuned model.

        Raises:
            ValueError: If ``method`` is not a supported search method.
        """
        from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
        from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

        # Default grid lives here rather than in the signature to avoid a
        # shared mutable default argument.
        if var_smoothing is None:
            var_smoothing = [1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9,
                             1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0]
        paras = [{
            'var_smoothing': var_smoothing
        }]
        method_key = method.lower()
        if method_key in ('random', 'randomized', 'randomizedsearch', 'randomizedsearchcv'):
            # Bug fix: cv and n_job were previously ignored for random search.
            searcher = RandomizedSearchCV(estimator=self.initial_gnb,
                                          param_distributions=paras, n_iter=n_iter,
                                          cv=cv, scoring='accuracy', verbose=True,
                                          n_jobs=n_job)
        elif method_key in ('grid', 'gridsearch', 'gridsearchcv'):
            # Bug fix: n_jobs was previously hard-coded to -1 instead of n_job.
            searcher = GridSearchCV(estimator=self.initial_gnb, param_grid=paras,
                                    cv=cv, scoring='accuracy', verbose=True,
                                    n_jobs=n_job)
        else:
            raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')
        searcher.fit(self.X_train, self.y_train)
        self.tuned_gnb = searcher
        # Validate the tuned model and store validation metrics.
        tuned_y_pred = searcher.predict(self.X_test)
        self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
        self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
        self.tuned_confusion_matrix_percent = (
            self.tuned_confusion_matrix.astype(float)
            / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
        )
        self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
        return self.tuned_gnb

    # Classify image
    def classify(self, src, model=None):
        """
        Classify an image using the Gaussian Naive Bayes model.

        Args:
            src (rasterio.DatasetReader): The source image to classify.
            model (GaussianNB, optional): The model to use for classification.
                If None, the tuned model (when available) or the initial model is used.

        Returns:
            The classified image as a single-band raster.

        Raises:
            ValueError: If ``src`` is not a rasterio DatasetReader.
        """
        import rasterio
        from geonate.common import reshape_raster, array2raster

        # Choose the estimator: explicit argument > tuned model > initial model.
        if model is not None:
            gnb_model = model
        else:
            gnb_model = self.tuned_gnb if self.tuned_gnb is not None else self.initial_gnb
        # Only rasterio datasets are accepted as input imagery.
        if not isinstance(src, rasterio.DatasetReader):
            raise ValueError('Source image is not supported')
        src_meta = src.meta
        nbands = src.count
        src_height = src.height
        src_width = src.width
        src_rast = src.read()
        # Reshape to (pixels, bands) so each pixel becomes one sample.
        src_img = reshape_raster(src_rast, mode='image')
        ds = src_img.reshape((-1, nbands))
        # Predict labels using the chosen model.
        pred_labels = gnb_model.predict(ds)
        # Restore the image shape and wrap the labels as a single-band raster.
        pred_result = pred_labels.reshape(src_height, src_width)
        src_meta.update({'count': 1})
        classified = array2raster(pred_result, metadata=src_meta)
        return classified
__init__(self, X_train, y_train, X_test, y_test)
special
¶
Initialize the Gaussian_Naive_Bayes class with training and testing data.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
X_train |
array-like |
Training feature data. |
required |
y_train |
array-like |
Training target data. |
required |
X_test |
array-like |
Testing feature data. |
required |
y_test |
array-like |
Testing target data. |
required |
Source code in geonate/classify.py
def __init__(self, X_train, y_train, X_test, y_test):
    """
    Initialize the Gaussian_Naive_Bayes class with training and testing data.

    Args:
        X_train (array-like): Training feature data.
        y_train (array-like): Training target data.
        X_test (array-like): Testing feature data.
        y_test (array-like): Testing target data.
    """
    # Store the train/test split on the instance.
    self.X_train, self.y_train = X_train, y_train
    self.X_test, self.y_test = X_test, y_test
    # No models exist yet; model() below creates the initial one.
    self.initial_gnb = None
    self.tuned_gnb = None
    # Fit and validate the initial model right away.
    self.model()
classify(self, src, model=None)
¶
Classify an image using the Gaussian Naive Bayes model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
src |
rasterio.DatasetReader |
The source image to classify. |
required |
model |
GaussianNB |
The model to use for classification. If None, the tuned model or initial model will be used. |
None |
Returns:
| Type | Description |
|---|---|
The classified image. |
Source code in geonate/classify.py
def classify(self, src, model=None):
    """
    Classify an image using the Gaussian Naive Bayes model.

    Args:
        src (rasterio.DatasetReader): The source image to classify.
        model (GaussianNB, optional): The model to use for classification.
            If None, the tuned model or initial model will be used.

    Returns:
        The classified image.
    """
    import rasterio
    from geonate.common import reshape_raster, array2raster

    # Pick the estimator: caller-supplied model wins, then tuned, then initial.
    if model is None:
        estimator = self.tuned_gnb if self.tuned_gnb is not None else self.initial_gnb
    else:
        estimator = model
    # Only rasterio datasets are accepted as input imagery.
    if not isinstance(src, rasterio.DatasetReader):
        raise ValueError('Source image is not supported')
    meta = src.meta
    band_count = src.count
    height, width = src.height, src.width
    raster = src.read()
    # Flatten the image so every pixel is one (bands,) feature vector.
    image = reshape_raster(raster, mode='image')
    samples = image.reshape((-1, band_count))
    # Predict a label for every pixel.
    labels = estimator.predict(samples)
    # Fold the flat predictions back into image shape and wrap as a raster.
    result = labels.reshape(height, width)
    meta.update({'count': 1})
    return array2raster(result, metadata=meta)
model(self, **kwargs)
¶
Train and validate the initial Gaussian Naive Bayes model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
**kwargs |
Additional keyword arguments for GaussianNB. |
{} |
Returns:
| Type | Description |
|---|---|
GaussianNB |
The trained Gaussian Naive Bayes model. |
Source code in geonate/classify.py
def model(self, **kwargs):
    """
    Train and validate the initial Gaussian Naive Bayes model.

    Args:
        **kwargs: Additional keyword arguments for GaussianNB.

    Returns:
        GaussianNB: The trained Gaussian Naive Bayes model.
    """
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    # Fit a fresh estimator on the training split.
    estimator = GaussianNB(**kwargs)
    estimator.fit(self.X_train, self.y_train)
    # Score on the held-out split and keep the metrics on the instance.
    predictions = estimator.predict(self.X_test)
    self.accuracy = accuracy_score(self.y_test, predictions)
    self.confusion_matrix = confusion_matrix(self.y_test, predictions)
    # Row-normalized confusion matrix, expressed as percentages.
    row_totals = self.confusion_matrix.sum(axis=1, keepdims=True)
    self.confusion_matrix_percent = self.confusion_matrix.astype(float) / row_totals * 100
    self.classification_report = classification_report(self.y_test, predictions)
    self.initial_gnb = estimator
    return self.initial_gnb
tune(self, method='random', var_smoothing=[1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0], n_iter=5, cv=5, n_job=-1)
¶
Tune the Gaussian Naive Bayes model using random search or grid search methods.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
method |
str |
The tuning method to use ('random' or 'grid'). |
'random' |
var_smoothing |
list |
List of var_smoothing values to try. |
[1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0] |
n_iter |
int |
Number of iterations for random search. |
5 |
cv |
int |
Number of cross-validation folds. |
5 |
n_job |
int |
Number of jobs to run in parallel. |
-1 |
Returns:
| Type | Description |
|---|---|
The tuned Gaussian Naive Bayes model. |
Source code in geonate/classify.py
def tune(self, method="random", var_smoothing=None, n_iter=5, cv=5, n_job=-1):
    """
    Tune the Gaussian Naive Bayes model using random search or grid search methods.

    Args:
        method (str): The tuning method to use ('random' or 'grid').
        var_smoothing (list, optional): var_smoothing values to try.
            Defaults to a log-spaced grid from 1e-15 to 1.
        n_iter (int): Number of iterations for random search.
        cv (int): Number of cross-validation folds.
        n_job (int): Number of jobs to run in parallel (-1 = all processors).

    Returns:
        The tuned Gaussian Naive Bayes model (fitted search object).

    Raises:
        ValueError: If ``method`` is not a supported search method.
    """
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    # Default grid kept out of the signature to avoid a mutable default argument.
    if var_smoothing is None:
        var_smoothing = [1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9,
                         1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0]
    paras = [{
        'var_smoothing': var_smoothing
    }]
    method_key = method.lower()
    if method_key in ('random', 'randomized', 'randomizedsearch', 'randomizedsearchcv'):
        # Bug fix: cv and n_job were previously ignored for random search.
        searcher = RandomizedSearchCV(estimator=self.initial_gnb,
                                      param_distributions=paras, n_iter=n_iter,
                                      cv=cv, scoring='accuracy', verbose=True,
                                      n_jobs=n_job)
    elif method_key in ('grid', 'gridsearch', 'gridsearchcv'):
        # Bug fix: n_jobs was previously hard-coded to -1 instead of n_job.
        searcher = GridSearchCV(estimator=self.initial_gnb, param_grid=paras,
                                cv=cv, scoring='accuracy', verbose=True,
                                n_jobs=n_job)
    else:
        raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')
    searcher.fit(self.X_train, self.y_train)
    self.tuned_gnb = searcher
    # Validate the tuned model and store validation metrics.
    tuned_y_pred = searcher.predict(self.X_test)
    self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix_percent = (
        self.tuned_confusion_matrix.astype(float)
        / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
    )
    self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
    return self.tuned_gnb
KNN
¶
A class that encapsulates a K-Nearest Neighbors (KNN) model for classification tasks, including model training, hyperparameter tuning using grid or random search, and classification of image data.
Attributes:
| Name | Type | Description |
|---|---|---|
X_train |
ndarray or DataFrame |
The training features for model fitting. |
y_train |
ndarray or Series |
The training labels for model fitting. |
X_test |
ndarray or DataFrame |
The test features for model validation. |
y_test |
ndarray or Series |
The test labels for model validation. |
initial_knn |
KNeighborsClassifier |
The initial KNN model (untuned). |
tuned_knn |
KNeighborsClassifier |
The KNN model after tuning using grid or random search. |
accuracy |
float |
Accuracy of the initial (naive) KNN model. |
confusion_matrix |
ndarray |
Confusion matrix of the initial (naive) KNN model. |
confusion_matrix_percent |
ndarray |
Percent-based confusion matrix for the initial (naive) model. |
tuned_accuracy |
float |
Accuracy of the tuned KNN model. |
tuned_confusion_matrix |
ndarray |
Confusion matrix of the tuned KNN model. |
tuned_confusion_matrix_percent |
ndarray |
Percent-based confusion matrix for the tuned model. |
Source code in geonate/classify.py
class KNN:
    """
    A class that encapsulates a K-Nearest Neighbors (KNN) model for classification
    tasks, including model training, hyperparameter tuning using grid or random
    search, and classification of image data.

    Attributes:
        X_train (ndarray or DataFrame): The training features for model fitting.
        y_train (ndarray or Series): The training labels for model fitting.
        X_test (ndarray or DataFrame): The test features for model validation.
        y_test (ndarray or Series): The test labels for model validation.
        initial_knn (KNeighborsClassifier, optional): The initial KNN model (untuned).
        tuned_knn (optional): The fitted search object after tuning.
        accuracy (float, optional): Accuracy of the initial (naive) KNN model.
        confusion_matrix (ndarray, optional): Confusion matrix of the initial (naive) KNN model.
        confusion_matrix_percent (ndarray, optional): Percent-based confusion matrix for the initial (naive) model.
        tuned_accuracy (float, optional): Accuracy of the tuned KNN model.
        tuned_confusion_matrix (ndarray, optional): Confusion matrix of the tuned KNN model.
        tuned_confusion_matrix_percent (ndarray, optional): Percent-based confusion matrix for the tuned model.
    """

    def __init__(self, X_train, y_train, X_test, y_test):
        """
        Initializes the KNN class with the provided training and testing data.

        Args:
            X_train (ndarray or DataFrame): The training features for model fitting.
            y_train (ndarray or Series): The training labels for model fitting.
            X_test (ndarray or DataFrame): The test features for model validation.
            y_test (ndarray or Series): The test labels for model validation.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.initial_knn = None
        self.tuned_knn = None
        # Automatically train and validate the initial model.
        self.model()

    # Initial model and validation
    def model(self, **kwargs):
        """
        Trains and validates the initial KNN model.

        Args:
            **kwargs: Additional keyword arguments for the KNeighborsClassifier model.

        Returns:
            KNeighborsClassifier: The trained KNN model.
        """
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

        # Initialize and fit the model on the training split.
        knn = KNeighborsClassifier(**kwargs)
        knn.fit(self.X_train, self.y_train)
        # Validate on the held-out test split and store validation metrics.
        y_pred = knn.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, y_pred)
        self.confusion_matrix = confusion_matrix(self.y_test, y_pred)
        # Row-normalized confusion matrix, expressed as percentages.
        self.confusion_matrix_percent = (
            self.confusion_matrix.astype(float)
            / self.confusion_matrix.sum(axis=1, keepdims=True) * 100
        )
        self.classification_report = classification_report(self.y_test, y_pred)
        self.initial_knn = knn
        return self.initial_knn

    # Tune the best parameters for the classifier using random or grid search
    def tune(self, method="random", n_neighbors=None, weights=None, n_iter=5, cv=5, n_job=-1):
        """
        Tunes the best parameters for the KNN classifier using random search or
        grid search methods.

        Args:
            method (str): The tuning method to use ('random' or 'grid'). Default is 'random'.
            n_neighbors (list, optional): Values for the number of neighbors to try.
                Defaults to [3, 5, 7, 9, 11].
            weights (list, optional): Weight functions used in prediction.
                Defaults to ['uniform', 'distance'].
            n_iter (int): Number of parameter settings sampled in random search. Default is 5.
            cv (int): Number of cross-validation folds. Default is 5.
            n_job (int): Number of jobs to run in parallel. Default is -1 (all processors).

        Returns:
            KNeighborsClassifier: The tuned KNN model (fitted search object).

        Raises:
            ValueError: If ``method`` is not a supported search method.
        """
        from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
        from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

        # Defaults live here rather than in the signature to avoid shared
        # mutable default arguments.
        if n_neighbors is None:
            n_neighbors = [3, 5, 7, 9, 11]
        if weights is None:
            weights = ['uniform', 'distance']
        paras = [{
            'n_neighbors': n_neighbors,
            'weights': weights
        }]
        method_key = method.lower()
        if method_key in ('random', 'randomized', 'randomizedsearch', 'randomizedsearchcv'):
            # Bug fix: cv and n_job were previously ignored for random search.
            searcher = RandomizedSearchCV(estimator=self.initial_knn,
                                          param_distributions=paras, n_iter=n_iter,
                                          cv=cv, scoring='accuracy', verbose=True,
                                          n_jobs=n_job)
        elif method_key in ('grid', 'gridsearch', 'gridsearchcv'):
            # Bug fix: n_jobs was previously hard-coded to -1 instead of n_job.
            searcher = GridSearchCV(estimator=self.initial_knn, param_grid=paras,
                                    cv=cv, scoring='accuracy', verbose=True,
                                    n_jobs=n_job)
        else:
            raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')
        searcher.fit(self.X_train, self.y_train)
        self.tuned_knn = searcher
        # Validate the tuned model and store validation metrics.
        tuned_y_pred = searcher.predict(self.X_test)
        self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
        self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
        self.tuned_confusion_matrix_percent = (
            self.tuned_confusion_matrix.astype(float)
            / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
        )
        self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
        return self.tuned_knn

    # Classify image
    def classify(self, src, model=None):
        """
        Classifies an image using the trained or tuned KNN model.

        Args:
            src (rasterio.DatasetReader): A rasterio object representing the image to classify.
            model (KNeighborsClassifier, optional): Trained KNN model to classify the image.
                If None, uses the tuned KNN model if available, otherwise the naive KNN model.

        Returns:
            rasterio.DatasetReader: The classified image as a raster object.

        Raises:
            ValueError: If ``src`` is not a rasterio DatasetReader.
        """
        import rasterio
        from geonate.common import reshape_raster, array2raster

        # Choose the estimator: explicit argument > tuned model > initial model.
        if model is not None:
            knn_model = model
        else:
            knn_model = self.tuned_knn if self.tuned_knn is not None else self.initial_knn
        # Only rasterio datasets are accepted as input imagery.
        if not isinstance(src, rasterio.DatasetReader):
            raise ValueError('Source image is not supported')
        src_meta = src.meta
        nbands = src.count
        src_height = src.height
        src_width = src.width
        src_rast = src.read()
        # Reshape to (pixels, bands) so each pixel becomes one sample.
        src_img = reshape_raster(src_rast, mode='image')
        ds = src_img.reshape((-1, nbands))
        # Predict labels using the chosen model.
        pred_labels = knn_model.predict(ds)
        # Restore the image shape and wrap the labels as a single-band raster.
        pred_result = pred_labels.reshape(src_height, src_width)
        src_meta.update({'count': 1})
        classified = array2raster(pred_result, metadata=src_meta)
        return classified
__init__(self, X_train, y_train, X_test, y_test)
special
¶
Initializes the KNN class with the provided training and testing data.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
X_train |
ndarray or DataFrame |
The training features for model fitting. |
required |
y_train |
ndarray or Series |
The training labels for model fitting. |
required |
X_test |
ndarray or DataFrame |
The test features for model validation. |
required |
y_test |
ndarray or Series |
The test labels for model validation. |
required |
Source code in geonate/classify.py
def __init__(self, X_train, y_train, X_test, y_test):
    """
    Initializes the KNN class with the provided training and testing data.

    Args:
        X_train (ndarray or DataFrame): The training features for model fitting.
        y_train (ndarray or Series): The training labels for model fitting.
        X_test (ndarray or DataFrame): The test features for model validation.
        y_test (ndarray or Series): The test labels for model validation.
    """
    # Store the train/test split on the instance.
    self.X_train, self.y_train = X_train, y_train
    self.X_test, self.y_test = X_test, y_test
    # No models exist yet; model() below creates the initial one.
    self.initial_knn = None
    self.tuned_knn = None
    # Fit and validate the initial model right away.
    self.model()
classify(self, src, model=None)
¶
Classifies an image using the trained or tuned KNN model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
src |
rasterio.DatasetReader |
A rasterio object representing the image to classify. |
required |
model |
KNeighborsClassifier |
Trained KNN model to classify the image. If None, uses the tuned KNN model if available, otherwise the naive KNN model. |
None |
Returns:
| Type | Description |
|---|---|
rasterio.DatasetReader |
The classified image as a raster object. |
Source code in geonate/classify.py
def classify(self, src, model=None):
    """
    Classifies an image using the trained or tuned KNN model.

    Args:
        src (rasterio.DatasetReader): A rasterio object representing the image to classify.
        model (KNeighborsClassifier, optional): Trained KNN model to classify the image.
            If None, uses the tuned KNN model if available, otherwise the naive KNN model.

    Returns:
        rasterio.DatasetReader: The classified image as a raster object.
    """
    import rasterio
    from geonate.common import reshape_raster, array2raster

    # Pick the estimator: caller-supplied model wins, then tuned, then initial.
    if model is None:
        estimator = self.tuned_knn if self.tuned_knn is not None else self.initial_knn
    else:
        estimator = model
    # Only rasterio datasets are accepted as input imagery.
    if not isinstance(src, rasterio.DatasetReader):
        raise ValueError('Source image is not supported')
    meta = src.meta
    band_count = src.count
    height, width = src.height, src.width
    raster = src.read()
    # Flatten the image so every pixel is one (bands,) feature vector.
    image = reshape_raster(raster, mode='image')
    samples = image.reshape((-1, band_count))
    # Predict a label for every pixel.
    labels = estimator.predict(samples)
    # Fold the flat predictions back into image shape and wrap as a raster.
    result = labels.reshape(height, width)
    meta.update({'count': 1})
    return array2raster(result, metadata=meta)
model(self, **kwargs)
¶
Trains and validates the initial KNN model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
**kwargs |
Additional keyword arguments for the KNeighborsClassifier model. |
{} |
Returns:
| Type | Description |
|---|---|
KNeighborsClassifier |
The trained KNN model. |
Source code in geonate/classify.py
def model(self, **kwargs):
    """
    Trains and validates the initial KNN model.

    Args:
        **kwargs: Additional keyword arguments for the KNeighborsClassifier model.

    Returns:
        KNeighborsClassifier: The trained KNN model.
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    # Fit a fresh estimator on the training split.
    estimator = KNeighborsClassifier(**kwargs)
    estimator.fit(self.X_train, self.y_train)
    # Score on the held-out split and keep the metrics on the instance.
    predictions = estimator.predict(self.X_test)
    self.accuracy = accuracy_score(self.y_test, predictions)
    self.confusion_matrix = confusion_matrix(self.y_test, predictions)
    # Row-normalized confusion matrix, expressed as percentages.
    row_totals = self.confusion_matrix.sum(axis=1, keepdims=True)
    self.confusion_matrix_percent = self.confusion_matrix.astype(float) / row_totals * 100
    self.classification_report = classification_report(self.y_test, predictions)
    self.initial_knn = estimator
    return self.initial_knn
tune(self, method='random', n_neighbors=[3, 5, 7, 9, 11], weights=['uniform', 'distance'], n_iter=5, cv=5, n_job=-1)
¶
Tunes the best parameters for the KNN classifier using random search or grid search methods.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
method |
str |
The tuning method to use ('random' or 'grid'). Default is 'random'. |
'random' |
n_neighbors |
list |
List of values for the number of neighbors to use. Default is [3, 5, 7, 9, 11]. |
[3, 5, 7, 9, 11] |
weights |
list |
List of weight functions used in prediction. Default is ['uniform', 'distance']. |
['uniform', 'distance'] |
n_iter |
int |
Number of parameter settings that are sampled in random search. Default is 5. |
5 |
cv |
int |
Number of cross-validation folds. Default is 5. |
5 |
n_job |
int |
Number of jobs to run in parallel. Default is -1 (use all processors). |
-1 |
Returns:
| Type | Description |
|---|---|
KNeighborsClassifier |
The tuned KNN model. |
Source code in geonate/classify.py
def tune(self, method="random", n_neighbors=None, weights=None, n_iter=5, cv=5, n_job=-1):
    """
    Tunes the best parameters for the KNN classifier using random search or
    grid search methods.

    Args:
        method (str): The tuning method to use ('random' or 'grid'). Default is 'random'.
        n_neighbors (list, optional): Values for the number of neighbors to try.
            Defaults to [3, 5, 7, 9, 11].
        weights (list, optional): Weight functions used in prediction.
            Defaults to ['uniform', 'distance'].
        n_iter (int): Number of parameter settings sampled in random search. Default is 5.
        cv (int): Number of cross-validation folds. Default is 5.
        n_job (int): Number of jobs to run in parallel. Default is -1 (all processors).

    Returns:
        KNeighborsClassifier: The tuned KNN model (fitted search object).

    Raises:
        ValueError: If ``method`` is not a supported search method.
    """
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    # Defaults live here rather than in the signature to avoid shared
    # mutable default arguments.
    if n_neighbors is None:
        n_neighbors = [3, 5, 7, 9, 11]
    if weights is None:
        weights = ['uniform', 'distance']
    paras = [{
        'n_neighbors': n_neighbors,
        'weights': weights
    }]
    method_key = method.lower()
    if method_key in ('random', 'randomized', 'randomizedsearch', 'randomizedsearchcv'):
        # Bug fix: cv and n_job were previously ignored for random search.
        searcher = RandomizedSearchCV(estimator=self.initial_knn,
                                      param_distributions=paras, n_iter=n_iter,
                                      cv=cv, scoring='accuracy', verbose=True,
                                      n_jobs=n_job)
    elif method_key in ('grid', 'gridsearch', 'gridsearchcv'):
        # Bug fix: n_jobs was previously hard-coded to -1 instead of n_job.
        searcher = GridSearchCV(estimator=self.initial_knn, param_grid=paras,
                                cv=cv, scoring='accuracy', verbose=True,
                                n_jobs=n_job)
    else:
        raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')
    searcher.fit(self.X_train, self.y_train)
    self.tuned_knn = searcher
    # Validate the tuned model and store validation metrics.
    tuned_y_pred = searcher.predict(self.X_test)
    self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix_percent = (
        self.tuned_confusion_matrix.astype(float)
        / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
    )
    self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
    return self.tuned_knn
RandomForest
¶
A class that encapsulates a Random Forest model for classification tasks, including model training, hyperparameter tuning using grid or random search, and classification of image data.
Attributes:
| Name | Type | Description |
|---|---|---|
X_train |
ndarray or DataFrame |
The training features for model fitting. |
y_train |
ndarray or Series |
The training labels for model fitting. |
X_test |
ndarray or DataFrame |
The test features for model validation. |
y_test |
ndarray or Series |
The test labels for model validation. |
initial_rf |
RandomForestClassifier |
The initial Random Forest model (untuned). |
tuned_rf |
RandomForestClassifier |
The Random Forest model after tuning using grid or random search. |
accuracy |
float |
Accuracy of the initial (naive) Random Forest model. |
confusion_matrix |
ndarray |
Confusion matrix of the initial (naive) Random Forest model. |
confusion_matrix_percent |
ndarray |
Percent-based confusion matrix for the initial (naive) model. |
tuned_accuracy |
float |
Accuracy of the tuned Random Forest model. |
tuned_confusion_matrix |
ndarray |
Confusion matrix of the tuned Random Forest model. |
tuned_confusion_matrix_percent |
ndarray |
Percent-based confusion matrix for the tuned model. |
Source code in geonate/classify.py
class RandomForest:
"""
A class that encapsulates a Random Forest model for classification tasks, including model training,
hyperparameter tuning using grid or random search, and classification of image data.
Attributes:
X_train (ndarray or DataFrame): The training features for model fitting.
y_train (ndarray or Series): The training labels for model fitting.
X_test (ndarray or DataFrame): The test features for model validation.
y_test (ndarray or Series): The test labels for model validation.
initial_rf (RandomForestClassifier, optional): The initial Random Forest model (untuned).
tuned_rf (RandomForestClassifier, optional): The Random Forest model after tuning using grid or random search.
accuracy (float, optional): Accuracy of the initial (naive) Random Forest model.
confusion_matrix (ndarray, optional): Confusion matrix of the initial (naive) Random Forest model.
confusion_matrix_percent (ndarray, optional): Percent-based confusion matrix for the initial (naive) model.
tuned_accuracy (float, optional): Accuracy of the tuned Random Forest model.
tuned_confusion_matrix (ndarray, optional): Confusion matrix of the tuned Random Forest model.
tuned_confusion_matrix_percent (ndarray, optional): Percent-based confusion matrix for the tuned model.
"""
def __init__(self, X_train, y_train, X_test, y_test):
    """
    Initializes the RandomForest class with the provided training and testing data.

    Args:
        X_train (ndarray or DataFrame): The training features for model fitting.
        y_train (ndarray or Series): The training labels for model fitting.
        X_test (ndarray or DataFrame): The test features for model validation.
        y_test (ndarray or Series): The test labels for model validation.
    """
    # Store the train/test split on the instance.
    self.X_train, self.y_train = X_train, y_train
    self.X_test, self.y_test = X_test, y_test
    # No models exist yet; model() below creates the initial one.
    self.initial_rf = None
    self.tuned_rf = None
    # Fit and validate the initial model right away.
    self.model()
# Initial model and validation
def model(self, n_estimators=100, **kwargs):
    """
    Trains a random forest classifier with the provided hyperparameters and validates it.

    Args:
        n_estimators (int, optional): The number of trees in the forest. Default is 100.
        **kwargs: Additional keyword arguments passed to the RandomForestClassifier.

    Returns:
        RandomForestClassifier: The trained (naive) random forest classifier.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    # Fit a fresh estimator on the training split.
    estimator = RandomForestClassifier(n_estimators=n_estimators, **kwargs)
    estimator.fit(self.X_train, self.y_train)
    # Score on the held-out split and keep the metrics on the instance.
    predictions = estimator.predict(self.X_test)
    self.accuracy = accuracy_score(self.y_test, predictions)
    self.confusion_matrix = confusion_matrix(self.y_test, predictions)
    # Row-normalized confusion matrix, expressed as percentages.
    row_totals = self.confusion_matrix.sum(axis=1, keepdims=True)
    self.confusion_matrix_percent = self.confusion_matrix.astype(float) / row_totals * 100
    self.classification_report = classification_report(self.y_test, predictions)
    self.initial_rf = estimator
    return self.initial_rf
# Tune the best parameters for classifier using random search or grid search methods
def tune(self, method="random", n_estimators=[100, 200, 300, 500, 1000], max_depth=[None, 10, 20, 30, 50], min_samples_split=[2, 5, 10, 20], min_samples_leaf=[1, 2, 3, 5], max_features=['sqrt'], n_iter=5, cv=5, n_job=-1):
    """
    Tunes the Random Forest model's hyperparameters using grid or random search.

    Args:
        method (str, optional): The method used for hyperparameter search. Can be 'random' or 'grid'. Default is 'random'.
        n_estimators (list, optional): List of values for the number of trees to search over. Default is [100, 200, 300, 500, 1000].
        max_depth (list, optional): List of values for the maximum depth of trees. Default is [None, 10, 20, 30, 50].
        min_samples_split (list, optional): List of values for the minimum number of samples required to split an internal node. Default is [2, 5, 10, 20].
        min_samples_leaf (list, optional): List of values for the minimum number of samples required to be at a leaf node. Default is [1, 2, 3, 5].
        max_features (list, optional): List of values for the number of features to consider when looking for the best split. Default is ['sqrt'].
        n_iter (int, optional): The number of parameter settings sampled by RandomizedSearchCV. Default is 5.
        cv (int, optional): Number of cross-validation folds. Default is 5.
        n_job (int, optional): The number of jobs to run in parallel. Default is -1 (use all processors).

    Returns:
        The fitted search object (RandomizedSearchCV or GridSearchCV), also stored as
        ``self.tuned_rf``; it supports ``predict`` and exposes ``best_estimator_`` / ``best_params_``.

    Raises:
        ValueError: If ``method`` is not a recognised search method.
    """
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
    # The list defaults are treated as read-only search spaces and never mutated
    paras = [{
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features
    }]
    search_method = method.lower()
    if search_method in ('random', 'randomized', 'randomizedsearch', 'randomizedsearchcv'):
        # BUGFIX: cv and n_job were previously ignored for the random search
        tuned_model = RandomizedSearchCV(estimator=self.initial_rf, param_distributions=paras, n_iter=n_iter, cv=cv, n_jobs=n_job, scoring='accuracy', verbose=True)
    elif search_method in ('grid', 'gridsearch', 'gridsearchcv'):
        # BUGFIX: n_jobs was hard-coded to -1; it now honors the n_job argument
        tuned_model = GridSearchCV(estimator=self.initial_rf, param_grid=paras, cv=cv, n_jobs=n_job, scoring='accuracy', verbose=True)
    else:
        raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')
    tuned_model.fit(self.X_train, self.y_train)
    self.tuned_rf = tuned_model
    # Validate the tuned model and store validation metrics
    tuned_y_pred = tuned_model.predict(self.X_test)
    self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix_percent = self.tuned_confusion_matrix.astype(float) / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
    self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
    return self.tuned_rf
# Classify image
def classify(self, src, model=None):
    """
    Classify a raster image with the trained or tuned Random Forest model.

    Args:
        src (rasterio.DatasetReader): Raster image to classify.
        model (object, optional): Trained model to use. If None, prefers the
            tuned model when available, otherwise the initial model.

    Returns:
        The classified single-band raster produced by ``array2raster``.

    Raises:
        ValueError: If ``src`` is not a rasterio DatasetReader.
    """
    import rasterio
    from .common import reshape_raster, array2raster
    # Pick the model: explicit argument > tuned model > initial model
    if model is None:
        chosen_model = self.initial_rf if self.tuned_rf is None else self.tuned_rf
    else:
        chosen_model = model
    if not isinstance(src, rasterio.DatasetReader):
        raise ValueError('Source image is not supported')
    meta = src.meta
    band_count = src.count
    height = src.height
    width = src.width
    raster_data = src.read()
    # Flatten the image to (n_pixels, n_bands) for per-pixel prediction
    image = reshape_raster(raster_data, mode='image')
    pixels = image.reshape((-1, band_count))
    labels = chosen_model.predict(pixels)
    # Restore the raster geometry and wrap as a single-band raster
    label_grid = labels.reshape(height, width)
    meta.update({'count': 1})
    return array2raster(label_grid, metadata=meta)
__init__(self, X_train, y_train, X_test, y_test)
special
¶
Initializes the RandomForest class with the provided training and testing data.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
X_train |
ndarray or DataFrame |
The training features for model fitting. |
required |
y_train |
ndarray or Series |
The training labels for model fitting. |
required |
X_test |
ndarray or DataFrame |
The test features for model validation. |
required |
y_test |
ndarray or Series |
The test labels for model validation. |
required |
Source code in geonate/classify.py
def __init__(self, X_train, y_train, X_test, y_test):
"""
Initializes the RandomForest class with the provided training and testing data.
Args:
X_train (ndarray or DataFrame): The training features for model fitting.
y_train (ndarray or Series): The training labels for model fitting.
X_test (ndarray or DataFrame): The test features for model validation.
y_test (ndarray or Series): The test labels for model validation.
"""
self.X_train = X_train
self.y_train = y_train
self.X_test = X_test
self.y_test = y_test
# Model slots: populated by model() (naive fit) and tune() (search result)
self.initial_rf = None
self.tuned_rf = None
# Automatically run the initial model so validation metrics are available immediately
self.model()
classify(self, src, model=None)
¶
Classifies an image using the trained or tuned Random Forest model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
src |
rasterio.DatasetReader |
A rasterio object representing the image to classify. |
required |
model |
object |
trained Random forest model to classify image. |
None |
Returns:
| Type | Description |
|---|---|
rasterio.DatasetReader |
The classified image as a raster object. |
Source code in geonate/classify.py
def classify(self, src, model=None):
    """
    Classify a raster image with the trained or tuned Random Forest model.

    Args:
        src (rasterio.DatasetReader): Raster image to classify.
        model (object, optional): Trained model to use. If None, prefers the
            tuned model when available, otherwise the initial model.

    Returns:
        The classified single-band raster produced by ``array2raster``.

    Raises:
        ValueError: If ``src`` is not a rasterio DatasetReader.
    """
    import rasterio
    from .common import reshape_raster, array2raster
    # Pick the model: explicit argument > tuned model > initial model
    if model is None:
        chosen_model = self.initial_rf if self.tuned_rf is None else self.tuned_rf
    else:
        chosen_model = model
    if not isinstance(src, rasterio.DatasetReader):
        raise ValueError('Source image is not supported')
    meta = src.meta
    band_count = src.count
    height = src.height
    width = src.width
    raster_data = src.read()
    # Flatten the image to (n_pixels, n_bands) for per-pixel prediction
    image = reshape_raster(raster_data, mode='image')
    pixels = image.reshape((-1, band_count))
    labels = chosen_model.predict(pixels)
    # Restore the raster geometry and wrap as a single-band raster
    label_grid = labels.reshape(height, width)
    meta.update({'count': 1})
    return array2raster(label_grid, metadata=meta)
model(self, n_estimators=100, **kwargs)
¶
Trains a random forest classifier with the provided hyperparameters and validates it.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
n_estimators |
int |
The number of trees in the forest. Default is 100. |
100 |
**kwargs |
Additional keyword arguments passed to the RandomForestClassifier. |
{} |
Returns:
| Type | Description |
|---|---|
RandomForestClassifier |
The trained (naive) random forest classifier. |
Source code in geonate/classify.py
def model(self, n_estimators=100, **kwargs):
    """
    Fit a naive random forest classifier and record its validation metrics.

    Args:
        n_estimators (int, optional): Number of trees in the forest. Default is 100.
        **kwargs: Extra keyword arguments forwarded to RandomForestClassifier.

    Returns:
        RandomForestClassifier: The fitted (untuned) classifier, also stored as ``self.initial_rf``.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    # Build and train the classifier on the training split
    classifier = RandomForestClassifier(n_estimators=n_estimators, **kwargs)
    classifier.fit(self.X_train, self.y_train)

    # Score the classifier on the held-out test split
    predictions = classifier.predict(self.X_test)
    self.accuracy = accuracy_score(self.y_test, predictions)
    self.confusion_matrix = confusion_matrix(self.y_test, predictions)
    # Row-normalised confusion matrix expressed as percentages
    row_totals = self.confusion_matrix.sum(axis=1, keepdims=True)
    self.confusion_matrix_percent = self.confusion_matrix.astype(float) / row_totals * 100
    self.classification_report = classification_report(self.y_test, predictions)
    self.initial_rf = classifier
    return self.initial_rf
tune(self, method='random', n_estimators=[100, 200, 300, 500, 1000], max_depth=[None, 10, 20, 30, 50], min_samples_split=[2, 5, 10, 20], min_samples_leaf=[1, 2, 3, 5], max_features=['sqrt'], n_iter=5, cv=5, n_job=-1)
¶
Tunes the Random Forest model's hyperparameters using grid or random search.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
method |
str |
The method used for hyperparameter search. Can be 'random' or 'grid'. Default is 'random'. |
'random' |
n_estimators |
list |
List of values for the number of trees to search over. Default is [100, 200, 300, 500, 1000]. |
[100, 200, 300, 500, 1000] |
max_depth |
list |
List of values for the maximum depth of trees. Default is [None, 10, 20, 30, 50]. |
[None, 10, 20, 30, 50] |
min_samples_split |
list |
List of values for the minimum number of samples required to split an internal node. Default is [2, 5, 10, 20]. |
[2, 5, 10, 20] |
min_samples_leaf |
list |
List of values for the minimum number of samples required to be at a leaf node. Default is [1, 2, 3, 5]. |
[1, 2, 3, 5] |
max_features |
list |
List of values for the number of features to consider when looking for the best split. Default is ['sqrt']. |
['sqrt'] |
n_iter |
int |
The number of parameter settings sampled by RandomizedSearchCV. Default is 5. |
5 |
cv |
int |
Number of cross-validation folds. Default is 5. |
5 |
n_job |
int |
The number of jobs to run in parallel. Default is -1 (use all processors). |
-1 |
Returns:
| Type | Description |
|---|---|
RandomForestClassifier |
The tuned Random Forest classifier. |
Source code in geonate/classify.py
def tune(self, method="random", n_estimators=[100, 200, 300, 500, 1000], max_depth=[None, 10, 20, 30, 50], min_samples_split=[2, 5, 10, 20], min_samples_leaf=[1, 2, 3, 5], max_features=['sqrt'], n_iter=5, cv=5, n_job=-1):
    """
    Tunes the Random Forest model's hyperparameters using grid or random search.

    Args:
        method (str, optional): The method used for hyperparameter search. Can be 'random' or 'grid'. Default is 'random'.
        n_estimators (list, optional): List of values for the number of trees to search over. Default is [100, 200, 300, 500, 1000].
        max_depth (list, optional): List of values for the maximum depth of trees. Default is [None, 10, 20, 30, 50].
        min_samples_split (list, optional): List of values for the minimum number of samples required to split an internal node. Default is [2, 5, 10, 20].
        min_samples_leaf (list, optional): List of values for the minimum number of samples required to be at a leaf node. Default is [1, 2, 3, 5].
        max_features (list, optional): List of values for the number of features to consider when looking for the best split. Default is ['sqrt'].
        n_iter (int, optional): The number of parameter settings sampled by RandomizedSearchCV. Default is 5.
        cv (int, optional): Number of cross-validation folds. Default is 5.
        n_job (int, optional): The number of jobs to run in parallel. Default is -1 (use all processors).

    Returns:
        The fitted search object (RandomizedSearchCV or GridSearchCV), also stored as
        ``self.tuned_rf``; it supports ``predict`` and exposes ``best_estimator_`` / ``best_params_``.

    Raises:
        ValueError: If ``method`` is not a recognised search method.
    """
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
    # The list defaults are treated as read-only search spaces and never mutated
    paras = [{
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features
    }]
    search_method = method.lower()
    if search_method in ('random', 'randomized', 'randomizedsearch', 'randomizedsearchcv'):
        # BUGFIX: cv and n_job were previously ignored for the random search
        tuned_model = RandomizedSearchCV(estimator=self.initial_rf, param_distributions=paras, n_iter=n_iter, cv=cv, n_jobs=n_job, scoring='accuracy', verbose=True)
    elif search_method in ('grid', 'gridsearch', 'gridsearchcv'):
        # BUGFIX: n_jobs was hard-coded to -1; it now honors the n_job argument
        tuned_model = GridSearchCV(estimator=self.initial_rf, param_grid=paras, cv=cv, n_jobs=n_job, scoring='accuracy', verbose=True)
    else:
        raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')
    tuned_model.fit(self.X_train, self.y_train)
    self.tuned_rf = tuned_model
    # Validate the tuned model and store validation metrics
    tuned_y_pred = tuned_model.predict(self.X_test)
    self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix_percent = self.tuned_confusion_matrix.astype(float) / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
    self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
    return self.tuned_rf
SVM
¶
A class that encapsulates a Support Vector Machine (SVM) model for classification tasks, including model training, hyperparameter tuning using grid or random search, and classification of image data.
Attributes:
| Name | Type | Description |
|---|---|---|
X_train |
ndarray or DataFrame |
The training features for model fitting. |
y_train |
ndarray or Series |
The training labels for model fitting. |
X_test |
ndarray or DataFrame |
The test features for model validation. |
y_test |
ndarray or Series |
The test labels for model validation. |
initial_svm |
SVC |
The initial SVM model (untuned). |
tuned_svm |
SVC |
The SVM model after tuning using grid or random search. |
accuracy |
float |
Accuracy of the initial (naive) SVM model. |
confusion_matrix |
ndarray |
Confusion matrix of the initial (naive) SVM model. |
confusion_matrix_percent |
ndarray |
Percent-based confusion matrix for the initial (naive) model. |
tuned_accuracy |
float |
Accuracy of the tuned SVM model. |
tuned_confusion_matrix |
ndarray |
Confusion matrix of the tuned SVM model. |
tuned_confusion_matrix_percent |
ndarray |
Percent-based confusion matrix for the tuned model. |
Source code in geonate/classify.py
class SVM:
    """
    A wrapper around a Support Vector Machine (SVM) classifier covering model training,
    hyperparameter tuning using grid or random search, and classification of image data.

    Attributes:
        X_train (ndarray or DataFrame): The training features for model fitting.
        y_train (ndarray or Series): The training labels for model fitting.
        X_test (ndarray or DataFrame): The test features for model validation.
        y_test (ndarray or Series): The test labels for model validation.
        initial_svm (SVC, optional): The initial SVM model (untuned).
        tuned_svm (optional): The fitted search object wrapping the tuned SVM
            (RandomizedSearchCV or GridSearchCV; it supports ``predict`` and
            exposes ``best_estimator_`` / ``best_params_``).
        accuracy (float, optional): Accuracy of the initial (naive) SVM model.
        confusion_matrix (ndarray, optional): Confusion matrix of the initial (naive) SVM model.
        confusion_matrix_percent (ndarray, optional): Percent-based (row-normalised) confusion matrix for the initial model.
        classification_report (str, optional): Classification report of the initial model.
        tuned_accuracy (float, optional): Accuracy of the tuned SVM model.
        tuned_confusion_matrix (ndarray, optional): Confusion matrix of the tuned SVM model.
        tuned_confusion_matrix_percent (ndarray, optional): Percent-based (row-normalised) confusion matrix for the tuned model.
        tuned_classification_report (str, optional): Classification report of the tuned model.
    """
    def __init__(self, X_train, y_train, X_test, y_test):
        """
        Initializes the SVM class with the provided training and testing data
        and immediately fits the initial model.

        Args:
            X_train (ndarray or DataFrame): The training features for model fitting.
            y_train (ndarray or Series): The training labels for model fitting.
            X_test (ndarray or DataFrame): The test features for model validation.
            y_test (ndarray or Series): The test labels for model validation.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # Model slots: populated by model() (naive fit) and tune() (search result)
        self.initial_svm = None
        self.tuned_svm = None
        # Automatically run the initial model so validation metrics are available immediately
        self.model()

    # Initial model and validation
    def model(self, kernel='rbf', **kwargs):
        """
        Trains and validates the initial SVM model.

        Args:
            kernel (str): Specifies the kernel type to be used in the algorithm. Default is 'rbf'.
            **kwargs: Additional keyword arguments for the SVC model.

        Returns:
            SVC: The trained SVM model, also stored as ``self.initial_svm``.
        """
        from sklearn.svm import SVC
        from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
        # Initialize and fit the model on the training split
        svm = SVC(kernel=kernel, **kwargs)
        svm.fit(self.X_train, self.y_train)
        # Validate on the held-out test split and store the metrics
        y_pred = svm.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, y_pred)
        self.confusion_matrix = confusion_matrix(self.y_test, y_pred)
        # Row-normalised confusion matrix expressed as percentages
        self.confusion_matrix_percent = self.confusion_matrix.astype(float) / self.confusion_matrix.sum(axis=1, keepdims=True) * 100
        self.classification_report = classification_report(self.y_test, y_pred)
        self.initial_svm = svm
        return self.initial_svm

    # Tune the best parameters for classifier using random search or grid search methods
    def tune(self, method="random", kernel=['rbf'], C=[1, 2, 4, 8, 10, 16, 32, 64, 100, 128, 1000], gamma=[1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0], n_iter=5, cv=5, n_job=-1):
        """
        Tunes the best parameters for the SVM classifier using random search or grid search methods.

        Args:
            method (str): The tuning method to use ('random' or 'grid'). Default is 'random'.
            kernel (list): List of kernel types to be used in the algorithm. Default is ['rbf'].
            C (list): List of regularization parameters. Default is [1, 2, 4, 8, 10, 16, 32, 64, 100, 128, 1000].
            gamma (list): List of kernel coefficient values. Default is [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0].
            n_iter (int): Number of parameter settings sampled by random search. Default is 5.
            cv (int): Number of cross-validation folds. Default is 5.
            n_job (int): Number of jobs to run in parallel. Default is -1 (use all processors).

        Returns:
            The fitted search object (RandomizedSearchCV or GridSearchCV), also stored as
            ``self.tuned_svm``; it supports ``predict`` and exposes ``best_estimator_``.

        Raises:
            ValueError: If ``method`` is not a recognised search method.
        """
        from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
        from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
        # The list defaults are treated as read-only search spaces and never mutated
        paras = [{
            'kernel': kernel,
            'C': C,
            'gamma': gamma
        }]
        search_method = method.lower()
        if search_method in ('random', 'randomized', 'randomizedsearch', 'randomizedsearchcv'):
            # BUGFIX: cv and n_job were previously ignored for the random search
            tuned_model = RandomizedSearchCV(estimator=self.initial_svm, param_distributions=paras, n_iter=n_iter, cv=cv, n_jobs=n_job, scoring='accuracy', verbose=True)
        elif search_method in ('grid', 'gridsearch', 'gridsearchcv'):
            # BUGFIX: n_jobs was hard-coded to -1; it now honors the n_job argument
            tuned_model = GridSearchCV(estimator=self.initial_svm, param_grid=paras, cv=cv, n_jobs=n_job, scoring='accuracy', verbose=True)
        else:
            raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')
        tuned_model.fit(self.X_train, self.y_train)
        self.tuned_svm = tuned_model
        # Validate the tuned model and store the metrics
        tuned_y_pred = tuned_model.predict(self.X_test)
        self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
        self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
        self.tuned_confusion_matrix_percent = self.tuned_confusion_matrix.astype(float) / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
        self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
        return self.tuned_svm

    # Classify image
    def classify(self, src, model=None):
        """
        Classifies an image using the trained or tuned SVM model.

        Args:
            src (rasterio.DatasetReader): A rasterio object representing the image to classify.
            model (SVC, optional): Trained SVM model to classify the image. If None, uses the
                tuned SVM model if available, otherwise the naive SVM model.

        Returns:
            The classified single-band raster produced by ``array2raster``.

        Raises:
            ValueError: If ``src`` is not a rasterio DatasetReader.
        """
        import rasterio
        from .common import reshape_raster, array2raster
        # Pick the model: explicit argument > tuned model > initial model
        if model is not None:
            SVM_model = model
        else:
            SVM_model = self.tuned_svm if self.tuned_svm is not None else self.initial_svm
        if not isinstance(src, rasterio.DatasetReader):
            raise ValueError('Source image is not supported')
        src_meta = src.meta
        nbands = src.count
        src_height = src.height
        src_width = src.width
        src_rast = src.read()
        # Flatten the image to (n_pixels, n_bands) for per-pixel prediction
        src_img = reshape_raster(src_rast, mode='image')
        ds = src_img.reshape((-1, nbands))
        pred_labels = SVM_model.predict(ds)
        # Restore the raster geometry and wrap as a single-band raster
        pred_result = pred_labels.reshape(src_height, src_width)
        src_meta.update({'count': 1})
        classified = array2raster(pred_result, metadata=src_meta)
        return classified
__init__(self, X_train, y_train, X_test, y_test)
special
¶
Initializes the SVM class with the provided training and testing data.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
X_train |
ndarray or DataFrame |
The training features for model fitting. |
required |
y_train |
ndarray or Series |
The training labels for model fitting. |
required |
X_test |
ndarray or DataFrame |
The test features for model validation. |
required |
y_test |
ndarray or Series |
The test labels for model validation. |
required |
Source code in geonate/classify.py
def __init__(self, X_train, y_train, X_test, y_test):
"""
Initializes the SVM class with the provided training and testing data.
Args:
X_train (ndarray or DataFrame): The training features for model fitting.
y_train (ndarray or Series): The training labels for model fitting.
X_test (ndarray or DataFrame): The test features for model validation.
y_test (ndarray or Series): The test labels for model validation.
"""
self.X_train = X_train
self.y_train = y_train
self.X_test = X_test
self.y_test = y_test
# Model slots: populated by model() (naive fit) and tune() (search result)
self.initial_svm = None
self.tuned_svm = None
# Automatically run the initial model so validation metrics are available immediately
self.model()
classify(self, src, model=None)
¶
Classifies an image using the trained or tuned SVM model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
src |
rasterio.DatasetReader |
A rasterio object representing the image to classify. |
required |
model |
SVC |
Trained SVM model to classify the image. If None, uses the tuned SVM model if available, otherwise the naive SVM model. |
None |
Returns:
| Type | Description |
|---|---|
rasterio.DatasetReader |
The classified image as a raster object. |
Source code in geonate/classify.py
def classify(self, src, model=None):
    """
    Classify a raster image with the trained or tuned SVM model.

    Args:
        src (rasterio.DatasetReader): Raster image to classify.
        model (SVC, optional): Trained SVM model to use. If None, prefers the
            tuned model when available, otherwise the initial model.

    Returns:
        The classified single-band raster produced by ``array2raster``.

    Raises:
        ValueError: If ``src`` is not a rasterio DatasetReader.
    """
    import rasterio
    from .common import reshape_raster, array2raster
    # Pick the model: explicit argument > tuned model > initial model
    if model is None:
        chosen_model = self.initial_svm if self.tuned_svm is None else self.tuned_svm
    else:
        chosen_model = model
    if not isinstance(src, rasterio.DatasetReader):
        raise ValueError('Source image is not supported')
    meta = src.meta
    band_count = src.count
    height = src.height
    width = src.width
    raster_data = src.read()
    # Flatten the image to (n_pixels, n_bands) for per-pixel prediction
    image = reshape_raster(raster_data, mode='image')
    pixels = image.reshape((-1, band_count))
    labels = chosen_model.predict(pixels)
    # Restore the raster geometry and wrap as a single-band raster
    label_grid = labels.reshape(height, width)
    meta.update({'count': 1})
    return array2raster(label_grid, metadata=meta)
model(self, kernel='rbf', **kwargs)
¶
Trains and validates the initial SVM model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
kernel |
str |
Specifies the kernel type to be used in the algorithm. Default is 'rbf'. |
'rbf' |
**kwargs |
Additional keyword arguments for the SVC model. |
{} |
Returns:
| Type | Description |
|---|---|
SVC |
The trained SVM model. |
Source code in geonate/classify.py
def model(self, kernel='rbf', **kwargs):
    """
    Fit the initial SVM classifier and record its validation metrics.

    Args:
        kernel (str): Kernel type used by the algorithm. Default is 'rbf'.
        **kwargs: Extra keyword arguments forwarded to SVC.

    Returns:
        SVC: The fitted (untuned) model, also stored as ``self.initial_svm``.
    """
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    # Build and train the classifier on the training split
    estimator = SVC(kernel=kernel, **kwargs)
    estimator.fit(self.X_train, self.y_train)

    # Score the classifier on the held-out test split
    predicted = estimator.predict(self.X_test)
    self.accuracy = accuracy_score(self.y_test, predicted)
    self.confusion_matrix = confusion_matrix(self.y_test, predicted)
    # Row-normalised confusion matrix expressed as percentages
    totals = self.confusion_matrix.sum(axis=1, keepdims=True)
    self.confusion_matrix_percent = self.confusion_matrix.astype(float) / totals * 100
    self.classification_report = classification_report(self.y_test, predicted)
    self.initial_svm = estimator
    return self.initial_svm
tune(self, method='random', kernel=['rbf'], C=[1, 2, 4, 8, 10, 16, 32, 64, 100, 128, 1000], gamma=[1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0], n_iter=5, cv=5, n_job=-1)
¶
Tunes the best parameters for the SVM classifier using random search or grid search methods.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
method |
str |
The tuning method to use ('random' or 'grid'). Default is 'random'. |
'random' |
kernel |
list |
List of kernel types to be used in the algorithm. Default is ['rbf']. |
['rbf'] |
C |
list |
List of regularization parameters. Default is [1, 2, 4, 8, 10, 16, 32, 64, 100, 128, 1000]. |
[1, 2, 4, 8, 10, 16, 32, 64, 100, 128, 1000] |
gamma |
list |
List of kernel coefficient values. Default is [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0]. |
[1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1.0] |
n_iter |
int |
Number of parameter settings that are sampled in random search. Default is 5. |
5 |
cv |
int |
Number of cross-validation folds. Default is 5. |
5 |
n_job |
int |
Number of jobs to run in parallel. Default is -1 (use all processors). |
-1 |
Returns:
| Type | Description |
|---|---|
SVC |
The tuned SVM model. |
Source code in geonate/classify.py
def tune(self, method="random", kernel=['rbf'], C=[1, 2, 4, 8, 10, 16, 32, 64, 100, 128, 1000], gamma=[1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0], n_iter=5, cv=5, n_job=-1):
    """
    Tunes the best parameters for the SVM classifier using random search or grid search methods.

    Args:
        method (str): The tuning method to use ('random' or 'grid'). Default is 'random'.
        kernel (list): List of kernel types to be used in the algorithm. Default is ['rbf'].
        C (list): List of regularization parameters. Default is [1, 2, 4, 8, 10, 16, 32, 64, 100, 128, 1000].
        gamma (list): List of kernel coefficient values. Default is [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0].
        n_iter (int): Number of parameter settings sampled by random search. Default is 5.
        cv (int): Number of cross-validation folds. Default is 5.
        n_job (int): Number of jobs to run in parallel. Default is -1 (use all processors).

    Returns:
        The fitted search object (RandomizedSearchCV or GridSearchCV), also stored as
        ``self.tuned_svm``; it supports ``predict`` and exposes ``best_estimator_``.

    Raises:
        ValueError: If ``method`` is not a recognised search method.
    """
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
    # The list defaults are treated as read-only search spaces and never mutated
    paras = [{
        'kernel': kernel,
        'C': C,
        'gamma': gamma
    }]
    search_method = method.lower()
    if search_method in ('random', 'randomized', 'randomizedsearch', 'randomizedsearchcv'):
        # BUGFIX: cv and n_job were previously ignored for the random search
        tuned_model = RandomizedSearchCV(estimator=self.initial_svm, param_distributions=paras, n_iter=n_iter, cv=cv, n_jobs=n_job, scoring='accuracy', verbose=True)
    elif search_method in ('grid', 'gridsearch', 'gridsearchcv'):
        # BUGFIX: n_jobs was hard-coded to -1; it now honors the n_job argument
        tuned_model = GridSearchCV(estimator=self.initial_svm, param_grid=paras, cv=cv, n_jobs=n_job, scoring='accuracy', verbose=True)
    else:
        raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')
    tuned_model.fit(self.X_train, self.y_train)
    self.tuned_svm = tuned_model
    # Validate the tuned model and store the metrics
    tuned_y_pred = tuned_model.predict(self.X_test)
    self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix_percent = self.tuned_confusion_matrix.astype(float) / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
    self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
    return self.tuned_svm
XGBoost
¶
A wrapper class for XGBoost classification, including model training, hyperparameter tuning, and classification of raster images.
Attributes:
| Name | Type | Description |
|---|---|---|
X_train |
array-like |
Training feature set. |
y_train |
array-like |
Training labels. |
X_test |
array-like |
Testing feature set. |
y_test |
array-like |
Testing labels. |
initial_xgb |
XGBClassifier or None |
The initial trained XGBoost model. |
tuned_xgb |
XGBClassifier or None |
The tuned XGBoost model (if tuning is performed). |
accuracy |
float |
Accuracy of the initial model on the test set. |
confusion_matrix |
ndarray |
Confusion matrix of the initial model. |
confusion_matrix_percent |
ndarray |
Normalized confusion matrix as percentages. |
classification_report |
str |
Classification report for the initial model. |
tuned_accuracy |
float |
Accuracy of the tuned model. |
tuned_confusion_matrix |
ndarray |
Confusion matrix of the tuned model. |
tuned_confusion_matrix_percent |
ndarray |
Normalized confusion matrix for the tuned model. |
tuned_classification_report |
str |
Classification report for the tuned model. |
Source code in geonate/classify.py
class XGBoost:
"""
A wrapper class for XGBoost classification, including model training, hyperparameter tuning, and classification of raster images.
Attributes:
X_train (array-like): Training feature set.
y_train (array-like): Training labels.
X_test (array-like): Testing feature set.
y_test (array-like): Testing labels.
initial_xgb (XGBClassifier or None): The initial trained XGBoost model.
tuned_xgb (XGBClassifier or None): The tuned XGBoost model (if tuning is performed).
accuracy (float): Accuracy of the initial model on the test set.
confusion_matrix (ndarray): Confusion matrix of the initial model.
confusion_matrix_percent (ndarray): Normalized confusion matrix as percentages.
classification_report (str): Classification report for the initial model.
tuned_accuracy (float): Accuracy of the tuned model.
tuned_confusion_matrix (ndarray): Confusion matrix of the tuned model.
tuned_confusion_matrix_percent (ndarray): Normalized confusion matrix for the tuned model.
tuned_classification_report (str): Classification report for the tuned model.
"""
def __init__(self, X_train, y_train, X_test, y_test):
"""
Initializes the XGBoost classifier with training and testing data, and automatically trains an initial model.
Args:
X_train (array-like): Training feature set.
y_train (array-like): Training labels.
X_test (array-like): Testing feature set.
y_test (array-like): Testing labels.
"""
self.X_train = X_train
self.y_train = y_train
self.X_test = X_test
self.y_test = y_test
# Model slots: populated by model() (naive fit) and tune() (search result)
self.initial_xgb = None
self.tuned_xgb = None
# Automatically run the initial model so validation metrics are available immediately
self.model()
# Initial model and validation
def model(self, **kwargs):
"""
Trains an initial XGBoost classifier using the provided training data and evaluates its performance on the test set.
Args:
**kwargs: Additional parameters to pass to XGBClassifier.
Returns:
XGBClassifier: The trained initial model.
"""
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Initialize model and fit the model
xgb = XGBClassifier(**kwargs)
xgb.fit(self.X_train, self.y_train)
# Validate the initial model and return validation metrics
y_pred = xgb.predict(self.X_test)
self.accuracy = accuracy_score(self.y_test, y_pred)
self.confusion_matrix = confusion_matrix(self.y_test, y_pred)
self.confusion_matrix_percent = self.confusion_matrix.astype(float) / self.confusion_matrix.sum(axis=1, keepdims=True) * 100
self.classification_report = classification_report(self.y_test, y_pred)
self.initial_xgb = xgb
return self.initial_xgb
# Tune the best parameters for classifier using random search or grid search methods
def tune(self, method="random", n_estimators=[100, 200, 300, 500, 1000], max_depth=[3, 5, 7, 9], learning_rate=[0.0001, 0.001, 0.01, 0.1], subsample=[0.5, 0.7, 1], n_iter=5, cv=5, n_job=-1):
"""
Tunes the hyperparameters of the XGBoost classifier using either RandomizedSearchCV or GridSearchCV.
Args:
method (str, optional): Search method, either "random" (default) or "grid".
n_estimators (list, optional): List of values for the number of trees.
max_depth (list, optional): List of values for the maximum tree depth.
learning_rate (list, optional): List of learning rates.
subsample (list, optional): List of subsampling ratios.
n_iter (int, optional): Number of iterations for random search (ignored for grid search).
cv (int, optional): Number of cross-validation folds.
n_job (int, optional): Number of parallel jobs (currently not used in the function).
Returns:
Best estimator from tuning process (XGBClassifier).
"""
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
paras = [{
'n_estimators': n_estimators,
'max_depth': max_depth,
'learning_rate': learning_rate,
'subsample': subsample
}]
if method.lower() == 'random' or method.lower() =='randomized' or method.lower() =='randomizedsearch' or method.lower() =='randomizedsearchcv':
random_searched = RandomizedSearchCV(estimator= self.initial_xgb, param_distributions=paras, n_iter= n_iter, scoring='accuracy', verbose=True)
random_searched.fit(self.X_train, self.y_train)
tuned_model = random_searched
self.tuned_xgb = tuned_model
elif method.lower() == 'grid' or method.lower() == 'gridsearch' or method.lower() == 'gridsearchcv':
grid_search = GridSearchCV(estimator= self.initial_xgb, param_grid= paras, cv=cv, scoring='accuracy', verbose=True, n_jobs=-1)
grid_search.fit(self.X_train, self.y_train)
tuned_model = grid_search
self.tuned_xgb = tuned_model
else:
raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')
# Validate the initial model and return validation metrics
tuned_y_pred = tuned_model.predict(self.X_test)
self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
self.tuned_confusion_matrix_percent = self.tuned_confusion_matrix.astype(float) / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
return self.tuned_xgb
# Classify image
def classify(self, src, model=None):
"""
Classifies an input raster image using the trained XGBoost model.
Args:
src (rasterio.DatasetReader): The source raster image to classify.
model (XGBClassifier, optional): The model to use for classification. If not provided, the tuned model is used (or the initial model if tuning was not performed).
Returns:
rasterio.io.MemoryFile: The classified raster image.
"""
import rasterio
from geonate.common import reshape_raster, array2raster
# Define the random forest model to use
if model is not None:
XGB_model = model
else:
XGB_model = self.tuned_xgb if self.tuned_xgb is not None else self.initial_xgb
# Define input parameters
if not isinstance(src, rasterio.DatasetReader):
raise ValueError('Source image is not supported')
else:
src_meta = src.meta
nbands = src.count
src_height = src.height
src_width = src.width
src_rast = src.read()
# Reshape and flatten data
src_img = reshape_raster(src_rast, mode='image')
ds = src_img.reshape((-1, nbands))
# Predict labels using the defined model
pred_labels = XGB_model.predict(ds)
# Reshape data and convert to raster format
pred_result = pred_labels.reshape(src_height, src_width)
src_meta.update({'count': 1})
classified = array2raster(pred_result, metadata=src_meta)
return classified
__init__(self, X_train, y_train, X_test, y_test)
special
¶
Initializes the XGBoost classifier with training and testing data, and automatically trains an initial model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
X_train |
array-like |
Training feature set. |
required |
y_train |
array-like |
Training labels. |
required |
X_test |
array-like |
Testing feature set. |
required |
y_test |
array-like |
Testing labels. |
required |
Source code in geonate/classify.py
def __init__(self, X_train, y_train, X_test, y_test):
    """
    Set up the classifier with train/test splits and immediately fit a baseline model.

    Args:
        X_train (array-like): Training feature set.
        y_train (array-like): Training labels.
        X_test (array-like): Testing feature set.
        y_test (array-like): Testing labels.
    """
    # Keep the data splits on the instance for later training/validation
    self.X_train, self.y_train = X_train, y_train
    self.X_test, self.y_test = X_test, y_test
    # No models have been fitted yet
    self.initial_xgb = None
    self.tuned_xgb = None
    # Train the baseline model right away
    self.model()
classify(self, src, model=None)
¶
Classifies an input raster image using the trained XGBoost model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
src |
rasterio.DatasetReader |
The source raster image to classify. |
required |
model |
XGBClassifier |
The model to use for classification. If not provided, the tuned model is used (or the initial model if tuning was not performed). |
None |
Returns:
| Type | Description |
|---|---|
rasterio.io.MemoryFile |
The classified raster image. |
Source code in geonate/classify.py
def classify(self, src, model=None):
    """
    Classifies an input raster image using the trained XGBoost model.

    Args:
        src (rasterio.DatasetReader): The source raster image to classify.
        model (XGBClassifier, optional): The model to use for classification. If not provided, the tuned model is used (or the initial model if tuning was not performed).

    Returns:
        rasterio.io.MemoryFile: The classified raster image.

    Raises:
        ValueError: If src is not a rasterio.DatasetReader.
    """
    import rasterio
    from geonate.common import reshape_raster, array2raster
    # Define the XGBoost model to use: explicit argument wins, then tuned, then initial
    if model is not None:
        XGB_model = model
    else:
        XGB_model = self.tuned_xgb if self.tuned_xgb is not None else self.initial_xgb
    # Validate the input and capture raster dimensions/metadata
    if not isinstance(src, rasterio.DatasetReader):
        raise ValueError('Source image is not supported')
    else:
        src_meta = src.meta
        nbands = src.count
        src_height = src.height
        src_width = src.width
        src_rast = src.read()
    # Reshape and flatten data to (pixels, bands) for per-pixel prediction
    src_img = reshape_raster(src_rast, mode='image')
    ds = src_img.reshape((-1, nbands))
    # Predict labels using the defined model
    pred_labels = XGB_model.predict(ds)
    # Reshape predictions back to the image grid and convert to raster format
    pred_result = pred_labels.reshape(src_height, src_width)
    src_meta.update({'count': 1})  # output is a single-band classified raster
    classified = array2raster(pred_result, metadata=src_meta)
    return classified
model(self, **kwargs)
¶
Trains an initial XGBoost classifier using the provided training data and evaluates its performance on the test set.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
**kwargs |
Additional parameters to pass to XGBClassifier. |
{} |
Returns:
| Type | Description |
|---|---|
XGBClassifier |
The trained initial model. |
Source code in geonate/classify.py
def model(self, **kwargs):
    """
    Fit a baseline XGBoost classifier on the training split and record its
    validation metrics against the test split.

    Args:
        **kwargs: Additional parameters forwarded to XGBClassifier.

    Returns:
        XGBClassifier: The trained initial model.
    """
    from xgboost import XGBClassifier
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    # Build and fit the classifier
    classifier = XGBClassifier(**kwargs)
    classifier.fit(self.X_train, self.y_train)

    # Score on the held-out test set and keep the metrics on the instance
    predictions = classifier.predict(self.X_test)
    self.accuracy = accuracy_score(self.y_test, predictions)
    cm = confusion_matrix(self.y_test, predictions)
    self.confusion_matrix = cm
    # Express each row (true class) as percentages
    row_totals = cm.sum(axis=1, keepdims=True)
    self.confusion_matrix_percent = cm.astype(float) / row_totals * 100
    self.classification_report = classification_report(self.y_test, predictions)

    self.initial_xgb = classifier
    return self.initial_xgb
tune(self, method='random', n_estimators=[100, 200, 300, 500, 1000], max_depth=[3, 5, 7, 9], learning_rate=[0.0001, 0.001, 0.01, 0.1], subsample=[0.5, 0.7, 1], n_iter=5, cv=5, n_job=-1)
¶
Tunes the hyperparameters of the XGBoost classifier using either RandomizedSearchCV or GridSearchCV.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
method |
str |
Search method, either "random" (default) or "grid". |
'random' |
n_estimators |
list |
List of values for the number of trees. |
[100, 200, 300, 500, 1000] |
max_depth |
list |
List of values for the maximum tree depth. |
[3, 5, 7, 9] |
learning_rate |
list |
List of learning rates. |
[0.0001, 0.001, 0.01, 0.1] |
subsample |
list |
List of subsampling ratios. |
[0.5, 0.7, 1] |
n_iter |
int |
Number of iterations for random search (ignored for grid search). |
5 |
cv |
int |
Number of cross-validation folds. |
5 |
n_job |
int |
Number of parallel jobs (currently not used in the function). |
-1 |
Returns:
| Type | Description |
|---|---|
Best estimator from tuning process (XGBClassifier). |
Source code in geonate/classify.py
def tune(self, method="random", n_estimators=[100, 200, 300, 500, 1000], max_depth=[3, 5, 7, 9], learning_rate=[0.0001, 0.001, 0.01, 0.1], subsample=[0.5, 0.7, 1], n_iter=5, cv=5, n_job=-1):
    """
    Tunes the hyperparameters of the XGBoost classifier using either
    RandomizedSearchCV or GridSearchCV.

    Args:
        method (str, optional): Search method, either "random" (default) or "grid".
        n_estimators (list, optional): List of values for the number of trees.
        max_depth (list, optional): List of values for the maximum tree depth.
        learning_rate (list, optional): List of learning rates.
        subsample (list, optional): List of subsampling ratios.
        n_iter (int, optional): Number of iterations for random search (ignored for grid search).
        cv (int, optional): Number of cross-validation folds.
        n_job (int, optional): Number of parallel jobs passed to the search.

    Returns:
        Best estimator from tuning process (XGBClassifier).
    """
    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

    paras = [{
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample
    }]

    # Normalize the method name once instead of repeated .lower() calls
    method_key = method.lower()
    if method_key in ('random', 'randomized', 'randomizedsearch', 'randomizedsearchcv'):
        # Fix: cv and n_job were previously not forwarded to RandomizedSearchCV
        tuned_model = RandomizedSearchCV(estimator=self.initial_xgb, param_distributions=paras, n_iter=n_iter, cv=cv, scoring='accuracy', verbose=True, n_jobs=n_job)
        tuned_model.fit(self.X_train, self.y_train)
        self.tuned_xgb = tuned_model
    elif method_key in ('grid', 'gridsearch', 'gridsearchcv'):
        # Fix: n_jobs was previously hardcoded to -1 instead of using n_job
        tuned_model = GridSearchCV(estimator=self.initial_xgb, param_grid=paras, cv=cv, scoring='accuracy', verbose=True, n_jobs=n_job)
        tuned_model.fit(self.X_train, self.y_train)
        self.tuned_xgb = tuned_model
    else:
        raise ValueError('Tune method is not supported, the current methods are "randomizedsearch" and "gridsearchcv"')

    # Validate the tuned model and store validation metrics
    tuned_y_pred = tuned_model.predict(self.X_test)
    self.tuned_accuracy = accuracy_score(self.y_test, tuned_y_pred)
    self.tuned_confusion_matrix = confusion_matrix(self.y_test, tuned_y_pred)
    # Normalize each row (true class) to percentages
    self.tuned_confusion_matrix_percent = self.tuned_confusion_matrix.astype(float) / self.tuned_confusion_matrix.sum(axis=1, keepdims=True) * 100
    self.tuned_classification_report = classification_report(self.y_test, tuned_y_pred)
    return self.tuned_xgb
kmeans(input, n_cluster=3, max_iter=500, algorithm='lloyd', **kwargs)
¶
Perform K-Means clustering for raster image. Kmeans is a fast and simple algorithm.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
input |
rasterio.DatasetReader or np.ndarray |
Multispectral input data. Can be a raster image or a numpy array. |
required |
n_cluster |
int |
Number of clusters to form. Default is 3. |
3 |
max_iter |
int |
Maximum number of iterations of the k-means algorithm for a single run. Default is 500. |
500 |
algorithm |
str |
K-means algorithm to use. The available algorithms include "lloyd" and "elkan". "elkan" variation can be more efficient on some datasets with well-defined clusters, by using the triangle inequality. However it’s more memory intensive due to the allocation of an extra array of shape. Default is 'lloyd'. |
'lloyd' |
**kwargs |
Additional keyword arguments to pass to the KMeans model. |
{} |
Returns:
| Type | Description |
|---|---|
np.ndarray or rasterio.DatasetReader |
K-Means clustering result in the same format as the input. |
Source code in geonate/classify.py
def kmeans(input, n_cluster=3, max_iter=500, algorithm='lloyd', **kwargs):
    """
    Perform K-Means clustering for raster image. Kmeans is a fast and simple algorithm.

    Args:
        input (rasterio.DatasetReader or np.ndarray): Multispectral input data. Can be a raster image or a numpy array.
        n_cluster (int): Number of clusters to form. Default is 3.
        max_iter (int): Maximum number of iterations of the k-means algorithm for a single run. Default is 500.
        algorithm (str): K-means algorithm to use. The available algorithms include "lloyd" and "elkan". "elkan" variation can be more efficient on some datasets with well-defined clusters, by using the triangle inequality. However it's more memory intensive due to the allocation of an extra array of shape. Default is 'lloyd'.
        **kwargs: Additional keyword arguments to pass to the KMeans model.

    Returns:
        np.ndarray or rasterio.DatasetReader: K-Means clustering result in the same format as the input.

    Raises:
        ValueError: If input is not a rasterio.DatasetReader or a multi-band ndarray.
    """
    import numpy as np
    import rasterio
    from sklearn.cluster import KMeans
    from .common import array2raster, reshape_raster

    # Identify datatype and define input data
    if isinstance(input, rasterio.DatasetReader):
        # Raster image: read bands and capture geospatial metadata
        arr = input.read()
        height, width = input.shape
        nbands = input.count
        meta = input.meta
    elif isinstance(input, np.ndarray):
        # Data array: must be (bands, height, width)
        if len(input.shape) < 3:
            raise ValueError('Input must be multispectral data (multi-band)')
        arr = input
        nbands, height, width = input.shape
    else:
        raise ValueError('Input is not supported')

    # Reshape from raster to image format, then flatten from 3D to (pixels, bands)
    # Fix: removed leftover debug print of the intermediate shape
    arr_reshape_img = reshape_raster(arr, mode='image')
    img_reshaped = arr_reshape_img.reshape((-1, nbands))

    # Define KMeans model and fit it to the flattened pixels
    kmean_model = KMeans(n_clusters=n_cluster, max_iter=max_iter, algorithm=algorithm, **kwargs)
    kmean_fit = kmean_model.fit(img_reshaped)

    # Extract labels and reshape back to the image grid
    labels = kmean_fit.labels_
    km_results = labels.reshape((height, width))

    # Return output in a format similar to the input
    if isinstance(input, np.ndarray):
        return km_results
    elif isinstance(input, rasterio.DatasetReader):
        meta.update({'count': 1})  # single-band label raster
        km_results_rast = array2raster(km_results, meta)
        return km_results_rast