GeoExamples: GDAL performance: raster classification using NumPy

Monday, June 3, 2013

GDAL performance: raster classification using NumPy

I wrote some time ago two posts about raster classification and how to colorize a raster using GDAL. Both examples work well, but they are terribly inefficient and won't process a really big raster.
In this post, I'll show how to work properly with rasters, specially with big rasters or when the processing time matters, such as on the fly raster generation processes.
Before continuing, I recommend to read this course about Geoprocessing by Chris Garrard, that helped me a lot.

The classified raster seen on QGIS

As usual, you can find all the code at GitHub

The original code

To learn about the major errors in my former posts, the best is to look at the code:

#! /usr/bin/python

#Change the value with your raster filename here
raster_file = 'w001001.adf'
output_file = 'classified.tiff'

classification_values = [0,500,1000,1500,2000,2500,3000,3500,4000] #The interval values to classify
classification_output_values = [10,20,30,40,50,60,70,80,90] #The value assigned to each interval

from osgeo import gdal
from osgeo.gdalconst import *
import numpy
import struct

#Opening the raster file
dataset = gdal.Open(raster_file, GA_ReadOnly )
band = dataset.GetRasterBand(1)
#Reading the raster properties
projectionfrom = dataset.GetProjection()
geotransform = dataset.GetGeoTransform()
xsize = band.XSize
ysize = band.YSize
datatype = band.DataType

#Reading the raster values
values = band.ReadRaster( 0, 0, xsize, ysize, xsize, ysize, datatype )
#Conversion between GDAL types and python pack types (Can't use complex integer or float!!)
data_types ={'Byte':'B','UInt16':'H','Int16':'h','UInt32':'I','Int32':'i','Float32':'f','Float64':'d'}
values = struct.unpack(data_types[gdal.GetDataTypeName(band.DataType)]*xsize*ysize,values)

#Now that the raster is into an array, let's classify it
out_str = ''
for value in values:
    index = 0
    for cl_value in classification_values:
        if value <= cl_value:
            out_str = out_str + struct.pack('B',classification_output_values[index])
            break
        index = index + 1
#Once classified, write the output raster
#In the example, it's not possible to use the same output format than the input file, because GDAL is not able to write this file format. Geotiff will be used instead
gtiff = gdal.GetDriverByName('GTiff') 
output_dataset = gtiff.Create(output_file, xsize, ysize, 4)
output_dataset.SetProjection(projectionfrom)
output_dataset.SetGeoTransform(geotransform)

output_dataset.GetRasterBand(1).WriteRaster( 0, 0, xsize, ysize, out_str ) 
output_dataset = None

Reading the raster without using the ReadAsArray function (lines 26 to 29). Is more inefficient and really more complicated.
Iterating pixel by pixel to calculate the classified values instead of doing it at once using the NumPy arrays capabilities.
Classifying all the possible values, instead of looking for the maximum and minimum values of the raster and classify it only in this interval
Not reading the raster by block size, which is much more efficient in big rasters, and indispensable for the really big ones.

I'll explain how to change that point by point.

New code

The following code has all the changes together, but at the GitHub you can find different versions applying them one by one:

import numpy
from numpy import zeros
from numpy import logical_and
from osgeo import gdal

classification_values = [0,500,1000,1500,2000,2500,3000,3500,4000,4500,5000,5500,6000,6500,7000,7500,8000] #The interval values to classify  
classification_output_values = [10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170] #The value assigned to each interval  
  
in_file = "YourInputFileHere.tif"

ds = gdal.Open(in_file)
band = ds.GetRasterBand(1)

block_sizes = band.GetBlockSize()
x_block_size = block_sizes[0]
y_block_size = block_sizes[1]

xsize = band.XSize
ysize = band.YSize

max_value = band.GetMaximum()
min_value = band.GetMinimum()

if max_value == None or min_value == None:
    stats = band.GetStatistics(0, 1)
    max_value = stats[1]
    min_value = stats[0]

format = "GTiff"
driver = gdal.GetDriverByName( format )
dst_ds = driver.Create("original_blocks.tif", xsize, ysize, 1, gdal.GDT_Byte )
dst_ds.SetGeoTransform(ds.GetGeoTransform())
dst_ds.SetProjection(ds.GetProjection())

for i in range(0, ysize, y_block_size):
    if i + y_block_size < ysize:
        rows = y_block_size
    else:
        rows = ysize - i
    for j in range(0, xsize, x_block_size):
        if j + x_block_size < xsize:
            cols = x_block_size
        else:
            cols = xsize - j

        data = band.ReadAsArray(j, i, cols, rows)
        r = zeros((rows, cols), numpy.uint8)

        for k in range(len(classification_values) - 1):
            if classification_values[k] <= max_value and (classification_values[k + 1] > min_value ):
                r = r + classification_output_values[k] * logical_and(data >= classification_values[k], data < classification_values[k + 1])
        if classification_values[k + 1] < max_value:
            r = r + classification_output_values[k+1] * (data >= classification_values[k + 1])

        dst_ds.GetRasterBand(1).WriteArray(r,j,i)

dst_ds = None

At line 46, you can see how ReadAsArray works. No conversion to NumPy array has to be done, and no types are needed to be specified. The example works using blocks, but you can read the whole array just by changing the parameters to ReadAsArray (0, 0, xsize, ysize).
This is the main change.

At line 47, an empty array is created, specifying that the output will be an array of unsigned bytes (the most efficient for classifications up to 256 values)
For every possible value in the classification (line 49), the input raster is evaluated, using the NumPy function logical_and. This creates an array with a boolean value set to true for each pixel that meets the condition. Then, the array is multiplied to the output classification value and added to the output array.
When this is done for each value in the scale, the output classified raster is get.
To see better how to do it, take a look at the file classification_numpy_arrays.py, since working by blocks makes things a bit more difficult to understand.

At lines 21 to 27, the maximum and minimum values are calculated. Note the if block. Some formats store these values, and others, like GeoTIFF, don't. But GDAL can calculate them, and stores the values in a file with the extension .aux.xml, so the next time the calculation is not needed.
All this takes a time, but then, at line 50, the if can be done, and the process improves a lot if there are many values to classify out of the image range.

Performance improvements

To see how does the new code affect the performance I needed bigger rasters than at the old example. The example files I have downloaded are from the CGIAR SRTM elevation data. I have chosen a region from the Himalaya (srtm_54_05.tif).

The results for this file are:

classification_original.py: 2m22.047s
classification_numpy_array.py: 0m32.359s
classification_blocks.py: 0m17.900s
classification_blocks_minmax.py: 0m14.223s

to test the time I ahve just run the script with using time:
time python classification_original.py

So the biggest improvement comes from using NumPy arrays instead of reading the file pixel by pixel. Then, using blocks instead of reading the whole array, also improves the performance more than a 40%, since the file is quite big (69MB)
Using the maximum and minimum can change the efficiency depending of the raster range.

But to see the importance of using blocks, I have merged four images from the same source, using:
gdal_merge.py -o big.tiff srtm*.tif

So the resulting GeoTIFF is 275MB, too big for my old computer to read it at once. The results are:

classification_original.py: 4m41.438s + Memory Error
classification_numpy_array.py: 1m5.858s + Memory Error
classification_blocks.py: 1m11.053s
classification_blocks_minmax.py: 0m50.891s

As you can see, not reading by block can make the script crash, so if you can have big files, it's mandatory.
And using all the improvements, the time is still less than the half it lasted the original script to process a raster 75% smaller!

Next post, I will explain how to colour the rasters using color tables or SciPy to generate PNG ths fast way.

25 comments:

UnknownApril 28, 2014 at 6:39 AM
I have found that combining gdalbuildvrt and gdal_translate is much better for performance when merging the blocks back together then gdal_merge.py. It also avoids the memory problems that gdal_merge.py has. The only downside is that it adds an extra step (albeit a short and simple one).
e.g. instead of "gdal_merge.py -o big.tiff srtm*.tif" use the following two lines:
gdalbuildvrt srtm*.tif big.vrt
gdal_translate -of GTiff big.vrt big.tif

I have been curious about the different types of rasters and their comparative read/write speeds for the intermediate steps (with gdal), have you used anything apart from GeoTIFFs for processing? I am working with 60GB compressed GeoTIFFs at the moment.
ReplyDelete
Replies
UnknownApril 28, 2014 at 1:10 PM
Hi mr Franklin :)

I have tested the scripts in several formats, but not with files as big as 60GB. The performance can change depending of the format, since each driver implements the rading block in a different way. GeoTIFF uses one line for each block, but you can define blocks of other sizes, let's say 256x256 pixels in the driver. The original example uses data from the usgs in AIG/Arc/Info Binary Grid, that uses 4 lines as a block. The driver that writes the output file can affect the result too for the same reasons, of course.
ReplyDelete
Replies
BenjaminAugust 28, 2014 at 3:20 PM
Hi,

I am testing your code and it works like a charm. I have only one question: How to get negaitve output values. Whatever I am trying, a negative value in the classification_output_values results in a zero on the output .tif.

I guess the problem lies here

r = r + classification_output_values[k] * logical_and(data >= classification_values[k], data < classification_values[k + 1])

but I don't understand why a multiplication of a negative value and True results in zero. I am not a python expert so I am asking you for help.

Cheers,
Benjamin
ReplyDelete
Replies
UnknownAugust 28, 2014 at 10:24 PM
Hi Benjamin,

I'm not at home now, so I can't test the code, but a negative number multiplied by True should give the negative value, since True acts as 1.
Maybe the problem is when using Numpy. I'll take a look, sorry for the delay.

Roger
ReplyDelete
Replies
FlashSeptember 28, 2014 at 4:54 PM
Hello, I'm from Brazil and I'm starting studies with GDAL and python, created the following scritp to qgis for NDVI classification on a raster

This code works correctly for small images

The problem happens with memory error when I try to process large raster

This is very important to me, did not quite understand the logic of the code that you have implemented, I tried to adapt your code in my code, but I did not succeed.

Could you help me create a code that performs the same function that I developed, only for large raster?

thank you so much

##Raster_de_entrada=raster
##NDVI=output raster

import numpy as np
from osgeo import gdal
from numpy import *

entrada = gdal.Open(Raster_de_entrada)
canalr = entrada.GetRasterBand(1)
bandar = canalr.ReadAsArray()
canalg = entrada.GetRasterBand(2)
bandag = canalg.ReadAsArray()

# Inicio do calculo NDVI para imagens RGB
# NDVI = (G - R) / (G + R)

bandag = array(bandag, dtype = float)
bandar = array(bandar, dtype = float)

var1 = subtract(bandag, bandar)
var2 = add(bandag, bandar)

ndvi = divide(var1,var2)
ndvi = ndvi*255

geo = entrada.GetGeoTransform()
proj = entrada.GetProjection()
shape = bandar.shape

driver = gdal.GetDriverByName('GTiff')
dst_ds = driver.Create(NDVI, shape[1], shape[0], 1, gdal.GDT_Float32, ['TFW=YES', \
'COMPRESS=LZW', 'TILED=YES'] )

dst_ds.SetGeoTransform( geo )
dst_ds.SetProjection( proj )

dst_ds.GetRasterBand(1).WriteArray(ndvi)
stat = dst_ds.GetRasterBand(1).GetStatistics(1,1)
dst_ds.GetRasterBand(1).SetStatistics(stat[0], stat[1], stat[2], stat[3])
ReplyDelete
Replies
HuguesJanuary 12, 2015 at 3:41 PM
Hello,

Thanks for your work which is very useful to reclass rasters and which learnt me a lot about the use of blocks. However, I think line 44 should be cols = xsize - j, shouldn't it ?

But if I'm not wrong no one noticed this ine the previous post. A question raises : is there something I don't understand ?
ReplyDelete
Replies
UnknownJanuary 22, 2015 at 3:11 PM
This comment has been removed by the author.
ReplyDelete
Replies
BenjaminFebruary 12, 2015 at 12:20 PM
Hi Roger,

after using your code for a while I found possible problems with the max_value.
In my specific setup I use the classification for a large number of files. They do not always fill the complete range of classification_values, which normally is not problem. Recently I had a file where the max_value was exactly one of my classification values which lead to a 0 after the reclassification.
So I propose to change line 50 to:
if classification_values[k] <= max_value and (classification_values[k + 1] > min_value ):
ReplyDelete
Replies
UnknownFebruary 12, 2015 at 2:48 PM
Hi Benjamin,

You are right, there is a bug here. I'll change it when I have a moment!
I'm happy to know that somebody us using it.
ReplyDelete
Replies
findchris2daySeptember 22, 2015 at 9:19 AM
This comment has been removed by the author.
ReplyDelete
Replies
ppaschOctober 22, 2015 at 4:24 PM
Hi Roger,
I've tried to use your code to calculate distance for the centroid of each grid cell (10000x10000) to one point and unfortunately I didn't manage to do it using the "block method".
I don't really understand how to get each centroid coordinates using blocks?
Do you think it is possible ?

thanks
ReplyDelete
Replies
UnknownSeptember 23, 2016 at 3:16 PM
Hi, I am tring to use your code to read my big raster image about 8gb. May be I am doing something wrong. I do not need primarily any classification. I just want to get return the original raster values in the array. But it iy showing all values are only Zer. Could you kindly check my code where i did mistakes.
my code is here:
import numpy
from numpy import zeros
from numpy import logical_and
from osgeo import gdal
#import struct

in_file = r"F:\Geodaten\Eigene\Luftbild\Orthobilder\20120502\Aschhorn_mosaik.jpg"
ds = gdal.Open(in_file)
band = ds.GetRasterBand(1)
# Get "natural" block size, and total raster XY size.
block_sizes = band.GetBlockSize()
x_block_size = block_sizes[0]
y_block_size = block_sizes[1]
xsize = band.XSize
ysize = band.YSize

format = "GTiff"
driver = gdal.GetDriverByName( format )
dst_ds = driver.Create("original_blocks.tif", xsize, ysize, 1, gdal.GDT_Byte )

dst_ds.SetGeoTransform(ds.GetGeoTransform())
dst_ds.SetProjection(ds.GetProjection())

# Function to read the raster as arrays for the chosen block size.
def read_raster(x_block_size, y_block_size):
raster = r"F:\Geodaten\Eigene\Luftbild\Orthobilder\20120502\Aschhorn_mosaik.jpg"
ds = gdal.Open(raster)
band = ds.GetRasterBand(1)
xsize = band.XSize
ysize = band.YSize
blocks = 0
for y in xrange(0, ysize, y_block_size):
if y + y_block_size < ysize:
rows = y_block_size
else:
rows = ysize - y
for x in xrange(0, xsize, x_block_size):
if x + x_block_size < xsize:
cols = x_block_size
else:
cols = xsize - x
array = band.ReadAsArray(x, y, cols, rows)
print array
dst_ds.GetRasterBand(1).WriteArray(array)
del array
blocks += 1
band = None
ds = None
#dst_ds.GetRasterBand(1).WriteArray(data)

read_raster(x_block_size, y_block_size)
band = None
ds = None
dst_ds = None
ReplyDelete
Replies
UnknownSeptember 23, 2016 at 3:23 PM
Hi,
Does your file Aschhorn_mosaik.jpg have a worldfile? I think that adding a projection to a jpeg file is not possible. You shoud create it yourself and change the line
ds.GetProjection()
and
ds.GetGeoTransform()
ReplyDelete
Replies
AnonymousAugust 17, 2017 at 8:47 AM
Hi,
I am new to this , can you help me please . Here the link and I wanted to get all the records into a DB ( https://www.fs.usda.gov/rds/fedora/objects/RDS:RDS-2015-0046/datastreams/RDS-2015-0046/content )

If you can help me that would be a great
ReplyDelete
Replies

Add comment