Showing posts with label raster. Show all posts
Showing posts with label raster. Show all posts

Saturday, June 29, 2013

GDAL Performance II: Create a PNG from a raster file

After improving the raster classification script, in this entry, is the turn for the PNG creation script.

As usual, all the code is at GitHub

Problems in the original script

  • The PIL python module was used to draw the PNG image pixel by pixel. This can be changed using the SciPy function scipy.misc.imsave. Using it, all the iterations, comparisons, etc. are done at a lower level, so the script is cleaner and much faster.
  • GDAL blocks weren't used, so all the image was loaded into a huge array if the original raster was big. Using blocks as in the last entry example, performance improves a lot.
  • All the values in the color file were evaluated. If the minimum and maximum values of the raster are known, many of these comparisons can be avoided, so the performance there is also improved.

The new script

The new script works exactly the same way as the old one at the user level.
'''
Script to generate a png from a raster file.
It can be used from the command line or from an other python script
'''
from os.path import exists
from osgeo import gdal
from osgeo.gdalconst import GA_ReadOnly
from sys import argv
from sys import exit
from numpy import logical_and
from numpy import zeros
from numpy import uint8
import scipy

'''
Main function.
Requires the input file, the color file and the otuput file name. 
The raster band is 1 by default. 
If nothing is asked, the discrete colroscale image is created.
'''
def raster2png(raster_file, color_file, out_file_name, raster_band=1, discrete=True):

    if exists(raster_file) is False:
            raise Exception('[Errno 2] No such file or directory: \'' + raster_file + '\'')    
    
    dataset = gdal.Open(raster_file, GA_ReadOnly )
    if dataset == None:
        raise Exception("Unable to read the data file")
    
    band = dataset.GetRasterBand(raster_band)

    block_sizes = band.GetBlockSize()
    x_block_size = block_sizes[0]
    y_block_size = block_sizes[1]

    xsize = band.XSize
    ysize = band.YSize

    max_value = band.GetMaximum()
    min_value = band.GetMinimum()

    if max_value == None or min_value == None:
        stats = band.GetStatistics(0, 1)
        max_value = stats[1]
        min_value = stats[0]

    #Reading the color table
    color_table = readColorTable(color_file)
    #Adding an extra value to avoid problems with the last & first entry
    if sorted(color_table.keys())[0] > min_value:
        color_table[min_value - 1] = color_table[sorted(color_table.keys())[0]]

    if sorted(color_table.keys())[-1] < max_value:
        color_table[max_value + 1] = color_table[sorted(color_table.keys())[-1]]
    #Preparing the color table and the output file
    classification_values = color_table.keys()
    classification_values.sort()
    

    rgb = zeros((ysize, xsize, 4), dtype = uint8)

    for i in range(0, ysize, y_block_size):
        if i + y_block_size < ysize:
            rows = y_block_size
        else:
            rows = ysize - i
        
        for j in range(0, xsize, x_block_size):
            if j + x_block_size < xsize:
                cols = x_block_size
            else:
                cols = xsize - j
                
            
            values = band.ReadAsArray(j, i, cols, rows)
            r = zeros((rows, cols), dtype = uint8)
            g = zeros((rows, cols), dtype = uint8)
            b = zeros((rows, cols), dtype = uint8)
            a = zeros((rows, cols), dtype = uint8)

            for k in range(len(classification_values) - 1):
                #print classification_values[k]
                if classification_values[k] < max_value and (classification_values[k + 1] > min_value ):
                    mask = logical_and(values >= classification_values[k], values < classification_values[k + 1])
                    if discrete == True:
                        r = r + color_table[classification_values[k]][0] * mask
                        g = g + color_table[classification_values[k]][1] * mask
                        b = b + color_table[classification_values[k]][2] * mask
                        a = a + color_table[classification_values[k]][3] * mask
                    else:
                        v0 = float(classification_values[k])
                        v1 = float(classification_values[k + 1])

                        r = r + mask * (color_table[classification_values[k]][0] + (values - v0)*(color_table[classification_values[k + 1]][0] - color_table[classification_values[k]][0])/(v1-v0) )
                        g = g + mask * (color_table[classification_values[k]][1] + (values - v0)*(color_table[classification_values[k + 1]][1] - color_table[classification_values[k]][1])/(v1-v0) )
                        b = b + mask * (color_table[classification_values[k]][2] + (values - v0)*(color_table[classification_values[k + 1]][2] - color_table[classification_values[k]][2])/(v1-v0) )
                        a = a + mask * (color_table[classification_values[k]][3] + (values - v0)*(color_table[classification_values[k + 1]][3] - color_table[classification_values[k]][3])/(v1-v0) )
                     
            rgb[i:i+rows,j:j+cols, 0] = r 
            rgb[i:i+rows,j:j+cols, 1] = g 
            rgb[i:i+rows,j:j+cols, 2] = b
            rgb[i:i+rows,j:j+cols, 3] = a   

    scipy.misc.imsave(out_file_name, rgb)
    

def readColorTable(color_file):
    '''
    The method for reading the color file.
    * If alpha is not defined, a 255 value is set (no transparency).
    '''    

    color_table = {}
    if exists(color_file) is False:
        raise Exception("Color file " + color_file + " does not exist")
    
    fp = open(color_file, "r")
    for line in fp:
        if line.find('#') == -1 and line.find('/') == -1:
            entry = line.split()
            if len(entry) == 5:
                alpha = int(entry[4])
            else:
                alpha=255
            color_table[eval(entry[0])]=[int(entry[1]),int(entry[2]),int(entry[3]),alpha]
    fp.close()
    
    return color_table

'''
Usage explanation
'''
def Usage():
    print "Not enough arguments." 
    print "Usage:"
    print argv[0] + ' [-exact_color_entry] [-band=1] data_file color_file output_file'    
    exit()

'''
Program Mainline
'''
if __name__ == "__main__":
    
    file_name = None
    colorfile_name = None
    out_file_name = None
    discrete = False
    band = 1

    i = 1
    while i < len(argv):
        arg = argv[i]
        if arg == '-exact_color_entry':
            discrete = True
        elif arg == '-band':
            band = argv[i+1]
            i = i + 1
        elif file_name is None:
            file_name = arg
            file_name = file_name.replace("'","")
            file_name = file_name.replace('"','')
        elif colorfile_name is None:
            colorfile_name = arg
            colorfile_name = colorfile_name.replace("'","")
            colorfile_name = colorfile_name.replace('"','')
        elif out_file_name is None:
            out_file_name = arg
            out_file_name = out_file_name.replace("'","")
            out_file_name = out_file_name.replace('"','')
        i = i + 1   



    if len(argv) == 1 or file_name == None or colorfile_name == None or out_file_name == None: 
       Usage()     
    '''
    try:
        raster2png(file_name,colorfile_name,out_file_name,band,discrete)
    except Exception, ex:
        print "Error: " + str(ex)
    '''
    raster2png(file_name,colorfile_name,out_file_name,band,discrete)
  • The part of the code that reads the input (lines 130 - 182) hasn't changed, like the function readColorTable, that reads the color file so it can be processed. Only the main method raster2png has changed.
  • Lines 26 to 45 read the raster file metadata, including the extreme values in the file.
  • Then, the colorfile is read, and extra values are added if the raster file has values above the maximum value at the color file or below the minimum (lines 47-57). If this isn't done, some strange nodata zones will appear.
  • At line 60, an array is created, where the four bands (red, green, blue and alpha) of the output png will be stored.
  • Then, every block of data is evaluated (after line 62). Empty arrays are created for each band, and for every value in the colorfile, the affected pixels are chosen at line 84.
  • Depending on the user choice, the colour is assigned either directly from the colorfile value or interpolating from the two most close colors. (lines 85-97) Line 91 and 92 have a float cast, very important to get the right value.
  • Lines 99-103 assign the block to the right output array region (see how a region of a NumPy array is chosen)
  • Line 104 creates the png file.

Benchmarks

To check the results, I have used the same raster used at the original post, and for bigger stuff, the rasters I downloaded for the last post (read it for the big file creation).

Old script

Filewith -exact_color_entryContinuous colors
w001001.adf (1126x576px)9.363s13.329s
srtm_54_05.tif (6001x6001px)9m19s12m44s
big.tif (12001x12001px)ErrorError

New script

Filewith -exact_color_entryContinuous colors
w001001.adf (1126x576px)0.822s1.653s
srtm_54_05.tif (6001x6001px)22.503s1m07s
big.tif (12001x12001px)1m46s4m58s

The improvement is really big and, sometimes, the old version will crash because not enough memory will be available.

What's next

I'm not satisfied yet with the way the image is saved using SciPy, because all the image must be in a single array, instead of working by blocks. Maybe there's an alternative to do this more efficiently. With really big images, the script will fail even using the new script version.

Monday, June 3, 2013

GDAL performance: raster classification using NumPy

I wrote some time ago two posts about raster classification and how to colorize a raster using GDAL. Both examples work well, but they are terribly inefficient and won't process a really big raster.
In this post, I'll show how to work properly with rasters, specially with big rasters or when the processing time matters, such as on the fly raster generation processes.
Before continuing, I recommend to read this course about Geoprocessing by Chris Garrard, that helped me a lot.

The classified raster seen on QGIS
As usual, you can find all the code at GitHub

The original code

To learn about the major errors in my former posts, the best is to look at the code:
#! /usr/bin/python

#Change the value with your raster filename here
raster_file = 'w001001.adf'
output_file = 'classified.tiff'

classification_values = [0,500,1000,1500,2000,2500,3000,3500,4000] #The interval values to classify
classification_output_values = [10,20,30,40,50,60,70,80,90] #The value assigned to each interval

from osgeo import gdal
from osgeo.gdalconst import *
import numpy
import struct

#Opening the raster file
dataset = gdal.Open(raster_file, GA_ReadOnly )
band = dataset.GetRasterBand(1)
#Reading the raster properties
projectionfrom = dataset.GetProjection()
geotransform = dataset.GetGeoTransform()
xsize = band.XSize
ysize = band.YSize
datatype = band.DataType

#Reading the raster values
values = band.ReadRaster( 0, 0, xsize, ysize, xsize, ysize, datatype )
#Conversion between GDAL types and python pack types (Can't use complex integer or float!!)
data_types ={'Byte':'B','UInt16':'H','Int16':'h','UInt32':'I','Int32':'i','Float32':'f','Float64':'d'}
values = struct.unpack(data_types[gdal.GetDataTypeName(band.DataType)]*xsize*ysize,values)

#Now that the raster is into an array, let's classify it
out_str = ''
for value in values:
    index = 0
    for cl_value in classification_values:
        if value <= cl_value:
            out_str = out_str + struct.pack('B',classification_output_values[index])
            break
        index = index + 1
#Once classified, write the output raster
#In the example, it's not possible to use the same output format than the input file, because GDAL is not able to write this file format. Geotiff will be used instead
gtiff = gdal.GetDriverByName('GTiff') 
output_dataset = gtiff.Create(output_file, xsize, ysize, 4)
output_dataset.SetProjection(projectionfrom)
output_dataset.SetGeoTransform(geotransform)

output_dataset.GetRasterBand(1).WriteRaster( 0, 0, xsize, ysize, out_str ) 
output_dataset = None
  1. Reading the raster without using the ReadAsArray function (lines 26 to 29). Is more inefficient and really more complicated.
  2. Iterating pixel by pixel to calculate the classified values instead of doing it at once using the NumPy arrays capabilities.
  3. Classifying all the possible values, instead of looking for the maximum and minimum values of the raster and classify it only in this interval
  4. Not reading the raster by block size, which is much more efficient in big rasters, and indispensable for the really big ones.
I'll explain how to change that point by point.

New code

The following code has all the changes together, but at the GitHub you can find different versions applying them one by one:

import numpy
from numpy import zeros
from numpy import logical_and
from osgeo import gdal

classification_values = [0,500,1000,1500,2000,2500,3000,3500,4000,4500,5000,5500,6000,6500,7000,7500,8000] #The interval values to classify  
classification_output_values = [10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170] #The value assigned to each interval  
  
in_file = "YourInputFileHere.tif"

ds = gdal.Open(in_file)
band = ds.GetRasterBand(1)

block_sizes = band.GetBlockSize()
x_block_size = block_sizes[0]
y_block_size = block_sizes[1]

xsize = band.XSize
ysize = band.YSize

max_value = band.GetMaximum()
min_value = band.GetMinimum()

if max_value == None or min_value == None:
    stats = band.GetStatistics(0, 1)
    max_value = stats[1]
    min_value = stats[0]

format = "GTiff"
driver = gdal.GetDriverByName( format )
dst_ds = driver.Create("original_blocks.tif", xsize, ysize, 1, gdal.GDT_Byte )
dst_ds.SetGeoTransform(ds.GetGeoTransform())
dst_ds.SetProjection(ds.GetProjection())

for i in range(0, ysize, y_block_size):
    if i + y_block_size < ysize:
        rows = y_block_size
    else:
        rows = ysize - i
    for j in range(0, xsize, x_block_size):
        if j + x_block_size < xsize:
            cols = x_block_size
        else:
            cols = xsize - j

        data = band.ReadAsArray(j, i, cols, rows)
        r = zeros((rows, cols), numpy.uint8)

        for k in range(len(classification_values) - 1):
            if classification_values[k] <= max_value and (classification_values[k + 1] > min_value ):
                r = r + classification_output_values[k] * logical_and(data >= classification_values[k], data < classification_values[k + 1])
        if classification_values[k + 1] < max_value:
            r = r + classification_output_values[k+1] * (data >= classification_values[k + 1])

        dst_ds.GetRasterBand(1).WriteArray(r,j,i)

dst_ds = None


  1. At line 46, you can see how ReadAsArray works. No conversion to NumPy array has to be done, and no types are needed to be specified. The example works using blocks, but you can read the whole array just by changing the parameters to ReadAsArray (0, 0, xsize, ysize).
  2. This is the main change.
    1. At line 47, an empty array is created, specifying that the output will be an array of unsigned bytes (the most efficient for classifications up to 256 values)
    2. For every possible value in the classification (line 49), the input raster is evaluated, using the NumPy function logical_and. This creates an array with a boolean value set to true for each pixel that meets the condition. Then, the array is multiplied to the output classification value and added to the output array.
      When this is done for each value in the scale, the output classified raster is get.
    3. To see better how to do it, take a look at the file classification_numpy_arrays.py, since working by blocks makes things a bit more difficult to understand.
  3. At lines 21 to 27, the maximum and minimum values are calculated. Note the if block. Some formats store these values, and others, like GeoTIFF, don't. But GDAL can calculate them, and stores the values in a file with the extension .aux.xml, so the next time the calculation is not needed.
    All this takes a time, but then, at line 50, the if can be done, and the process improves a lot if there are many values to classify out of the image range.

Performance improvements

To see how does the new code affect the performance I needed bigger rasters than at the old example. The example files I have downloaded are from the CGIAR SRTM elevation data. I have chosen a region from the Himalaya (srtm_54_05.tif).


The results for this file are:
  1. classification_original.py:  2m22.047s
  2. classification_numpy_array.py:  0m32.359s
  3. classification_blocks.py:  0m17.900s
  4. classification_blocks_minmax.py: 0m14.223s

to test the time I ahve just run the script with using time:
time python classification_original.py

So the biggest improvement comes from using NumPy arrays instead of reading the file pixel by pixel.  Then, using blocks instead of reading the whole array, also improves the performance more than a 40%, since the file is quite big (69MB)
Using the maximum and minimum can change the efficiency depending of the raster range.

But to see the importance of using blocks, I have merged four images from the same source, using:
gdal_merge.py -o big.tiff srtm*.tif

So the resulting GeoTIFF is 275MB, too big for my old computer to read it at once. The results are:
  1. classification_original.py: 4m41.438s + Memory Error
  2. classification_numpy_array.py:  1m5.858s + Memory Error
  3. classification_blocks.py: 1m11.053s
  4. classification_blocks_minmax.py: 0m50.891s
As you can see, not reading by block can make the script crash, so if you can have big files, it's mandatory.
And using all the improvements, the time is still less than the half it lasted the original script to process a raster 75% smaller!

Next post, I will explain how to colour the rasters using color tables or SciPy  to generate PNG ths fast way.

Sunday, February 19, 2012

Colorize raster with GDAL python

As I was telling in my last post, what we usually want is to take a raster file, classify it, and output a png in a color scale.
The easiest way to do it, is using gdaldem with the color-relief option.
But is not in python.
So, if you want to integrate it in a script, or modify it's behavior easily, here you have a solution.
All the example data, including the whole script, dataset, etc, can be downloaded here.

The script:


The first thing to do is reading the color file. As in the gdaldem application, the file is plain text with four columns, value, red value, green value, blue value, and, optionally, the alpha value. We will consider alpha value to be 255 if is not set. The comments are indicated with the hash. This format is the one used in the GRASS r.colors utility
So a function is defined:

def readColorTable(color_file):
    color_table = {}
    if exists(color_file) is False:
        raise Exception("Color file " + color_file + " does not exist")
    
    fp = open(color_file, "r")
    for line in fp:
        if line.find('#') == -1 and line.find('/') == -1:
            entry = line.split()
            if len(entry) == 5:
                alpha = int(entry[4])
            else:
                alpha=0
            color_table[eval(entry[0])]=[int(entry[1]),int(entry[2]),int(entry[3]),alpha]
    fp.close()
    
    return color_table

Once the table is read, is necessary to read the band in the raster file:

from os.path import exists
from osgeo import gdal
from osgeo.gdalconst import GA_ReadOnly
from struct import unpack

data_types ={'Byte':'B','UInt16':'H','Int16':'h','UInt32':'I','Int32':'i','Float32':'f','Float64':'d'}
if exists(raster_file) is False:
    raise Exception('[Errno 2] No such file or directory: \'' + raster_file + '\'')    
    
dataset = gdal.Open(raster_file, GA_ReadOnly )
if dataset == None:
    raise Exception("Unable to read the data file")
band = dataset.GetRasterBand(raster_band)
values = band.ReadRaster( 0, 0, band.XSize, band.YSize, band.XSize, band.YSize, band.DataType )
values = unpack(data_types[gdal.GetDataTypeName(band.DataType)]*band.XSize*band.YSize,values)

Notice that the data types in the struct command and in the GDAL library haven't got the same names, so a dict has to be created to translate them. (Maybe there's a better way to do it, but this one works) A band has to be set. If the file has got only one band (as in our example), the value must be one. Then, two images must be created, one for the colors, an the other for the alpha channel:

import Image
import ImageDraw
base = Image.new( 'RGBA', (band.XSize,band.YSize) )
base_draw = ImageDraw.Draw(base)
alpha_mask = Image.new('L', (band.XSize,band.YSize), 255)
alpha_draw = ImageDraw.Draw(alpha_mask)

Now, the classification is done. There are two modes, the one that gives the exact color defined in the color file, and the one that calculates the color according to the value, and the two nearest colors in the palette. Here, to make more readable, only the exact color is shown:

import Image
color_table = readColorTable(color_file)
classification_values = color_table.keys()
classification_values.sort()
for pos in range(len(values)):
        y = pos/band.XSize
        x = pos - y * band.XSize
        for index in range(len(classification_values)):

            if values[pos] <= classification_values[index] or index == len(classification_values)-1:
                if index == 0:
                        index = 1
                    elif index == len(classification_values)-1 and values[pos] >= classification_values[index]:
                        index = index + 1
                    color = color_table[classification_values[index-1]]
                    base_draw.point((x,y), (color[0],color[1],color[2]))
                    alpha_draw.point((x,y),color[3])
                                    
                    base_draw.point((x,y), (int(r),int(g),int(b)))
                    alpha_draw.point((x,y),int(a))
                    
            break

The first thing to do is getting the values of the color table. Then, for each pixel, the pixel position is calculated. To know the color value to be used, is necessary to iterate all the values in the color table to see which one is just over the pixel value. There is a special case if the pixel value is under the minimum value of the scale. Then, the pixel is set in the color image and the alpha channel. See the break statement to exit the loop.
Once the image and the alpha channel are calculated, the alpha channel has to be blend with the color channel before saving the result:

color_layer = Image.new('RGBA', base.size, (255, 255, 255, 0))
base = Image.composite(color_layer, base, alpha_mask)
base.save(out_file_name)

And that's all.
In the example, everything is packed in a function, and is possible to call it from the command line.
To calculate the color in the -nearest_color_entry mode, the code would be like this for each channel:

r = color_table[classification_values[index-1]][0] + (values[pos] - classification_values[index-1])*(color_table[classification_values[index]][0] - color_table[classification_values[index-1]][0])/(classification_values[index]-classification_values[index-1])

Of course, there is also the special case when the pixel value is lower than the lowest value in the scale or higher than the higher one. The solution is in the script.

Running the code:

We will use the same data of the last post (elevation data from the USGS).
All the sample data and the script is here.
A color file example could be this one (for our elevation data)

1600 166 122 59
1900 180 165 98
2200 188 189 123
2500 74 156 74
2800 123 189 83
3100 165 205 132
3300 255 255 255
 
To create the file, just type:
python colorize.py -exact_color_entry w001001.adf colorfile.clr out.png
or
python colorize.py -nearest_color_entry w001001.adf colorfile.clr out.png

Sample output with the -exact_color_entry option
Sample output with the default behaviour
It is possible to call the function from another python script just by typing

from colorize import raster2png
try:
    raster2png(file_name,colorfile_name,out_file_name,band,discrete)
except Exception, ex:
    print "Error: " + str(ex) 

If discrete is True, the output is with the exact color in the palette. If is False, with the continuous mode.

Monday, February 6, 2012

Raster classification with GDAL Python

There is a new entry about this topic, with a much more efficient code:


Classifying a raster means assigning a set of discrete values from the original continuous raster data. You can think about it as colorizing a raster file using data intervals, but it has many other uses, of course.

We will use the  Seamless Data Warehouse page from the USGS to get some data. Just open the following page and use the download section tools to get the data from the place you prefer:
http://seamless.usgs.gov/website/seamless/viewer.htm
The result is a file in the sub folder with a number.

So be sure that the GDAL python bindings are installed and execute the script:

#! /usr/bin/python

#Change the value with your raster filename here
raster_file = 'w001001.adf'
output_file = 'classified.tiff'

classification_values = [0,500,1000,1500,2000,2500,3000,3500,4000] #The interval values to classify
classification_output_values = [10,20,30,40,50,60,70,80,90] #The value assigned to each interval

from osgeo import gdal
from osgeo.gdalconst import *
import numpy
import struct

#Opening the raster file
dataset = gdal.Open(raster_file, GA_ReadOnly )
band = dataset.GetRasterBand(1)
#Reading the raster properties
projectionfrom = dataset.GetProjection()
geotransform = dataset.GetGeoTransform()
xsize = band.XSize
ysize = band.YSize
datatype = band.DataType

#Reading the raster values
values = band.ReadRaster( 0, 0, xsize, ysize, xsize, ysize, datatype )
#Conversion between GDAL types and python pack types (Can't use complex integer or float!!)
data_types ={'Byte':'B','UInt16':'H','Int16':'h','UInt32':'I','Int32':'i','Float32':'f','Float64':'d'}
values = struct.unpack(data_types[gdal.GetDataTypeName(band.DataType)]*xsize*ysize,values)

#Now that the raster is into an array, let's classify it
out_str = ''
for value in values:
    index = 0
    for cl_value in classification_values:
        if value <= cl_value:
            out_str = out_str + struct.pack('B',classification_output_values[index])
            break
        index = index + 1
#Once classified, write the output raster
#In the example, it's not possible to use the same output format than the input file, because GDAL is not able to write this file format. Geotiff will be used instead
gtiff = gdal.GetDriverByName('GTiff') 
output_dataset = gtiff.Create(output_file, xsize, ysize, 4)
output_dataset.SetProjection(projectionfrom)
output_dataset.SetGeoTransform(geotransform)

output_dataset.GetRasterBand(1).WriteRaster( 0, 0, xsize, ysize, out_str ) 
output_dataset = None

  • To classify the values is necessary to check for each classification value, until it fits to the interval. 
  • Look also how the projection is kept. The same for the geotransform.
  • If the input file format can be written by GDAL, it's possible to get it with:
    driver = dataset.GetDriver()
  • Look how the GDAL data types are managed. The names are different from the python types.
To see what has occurred,  the best is using QGIS.
Just open both files and you will see that the original file looks something like that once coloured with pseudocolor:
 And how the classified file looks like.

A nice exercice would be generating the coloured png's directly from the python script from a color file. Maybe the next post.