Thursday, January 22, 2015

Calculating theoretical isotope patterns for mass spectrometry with PostgreSQL and pgchem::tigress

New year, new challenge: mass spectra.

To calculate the theoretical isotope pattern for a given molecule in PostgreSQL you need two things:
  1. a cheminformatics toolkit to determine the element composition of the molecule and its exact mass
  2. a usably fast and accurate algorithm to infer the isotope pattern from the composition
For 1. I take OpenBabel, for 2. the Mercury7 algorithm as implemented in libmercury++.
Then, the two have to be made working together. Like so:
 
/* Result container for one theoretical isotope pattern.
 * All arrays have num_entries elements and are heap-allocated (calloc)
 * by ob_molfile_to_isotope_pattern(); the caller frees them. Plain C so
 * it can cross the C++ (OpenBabel/mercury) -> C (PostgreSQL) boundary. */
typedef struct
{
    size_t num_entries;           /* number of peaks in the pattern */
    double *mz;                   /* m/z value of each peak */
    double *intensity;            /* raw relative abundance of each peak */
    double *intensity_normalized; /* abundance scaled to the 'normal' target (e.g. 100%) */
    unsigned int *md;             /* mass distance of each peak to the base peak, in nominal mass units */
} _ISOTOPE_PATTERN;
 
extern "C" _ISOTOPE_PATTERN *
ob_molfile_to_isotope_pattern(char *molfile, int charge, double normal)
{
    _ISOTOPE_PATTERN *retval;
    vector msa_mz;
    vector msa_abundance;
    vector composition;
    vector m;
    const double limit = 10e-30;
    OBMol mol;
    OBConversion conv;
    string tmpStr (molfile);
    istringstream molstream (tmpStr);
    double mass_diff, monoisotopic_mass, scale = 1.0, delta = numeric_limits::max();
    int i=0;

    //Initialize composition vector to all zeroes
    for(unsigned int i=0; i         composition.push_back(0);

    conv.SetInAndOutFormats("MDL",NULL);

    conv.Read(&mol, &molstream);

    if (mol.Empty ())
        return NULL;

    // Add missing hydrogens, if any
    mol.AddHydrogens(false, false); // All Hs, no PH correction

    //Iterate over atoms and update the composition vector accordingly
    FOR_ATOMS_OF_MOL(atom, mol)
    {
        switch (atom->GetAtomicNum())
        {
        case 1: // H
            composition[0]++;
            break;
        case 6: // C
            composition[1]++;
            break;
        case 7: // N
            composition[2]++;
            break;
        case 8: // O
            composition[3]++;
            break;
        case 16: // S
            composition[4]++;
            break;
        case 35: // Br
            composition[5]++;
            break;
        case 17: // Cl
            composition[6]++;
            break;
        case 53: // I
            composition[7]++;
            break;
        case 15: // P
            composition[8]++;
            break;
        case 9: // F
            composition[9]++;
            break;
        default: // Others ignored
            break;
        }
    }

    // Calculate isotope patterns with MERCURY7
    if(0 != mercury::mercury(msa_mz, msa_abundance, composition, charge, limit))
    {
        return NULL;
    }

    monoisotopic_mass = mol.GetExactMass();

    // Allocate return value and copy values
    retval = (_ISOTOPE_PATTERN*) calloc(1,sizeof(_ISOTOPE_PATTERN));

    retval->num_entries = msa_mz.size();
    retval->mz=(double*) calloc(msa_mz.size(), sizeof(double));
    retval->intensity=(double*) calloc(msa_abundance.size(), sizeof(double));
    retval->intensity_normalized=(double*) calloc(msa_abundance.size(), sizeof(double));
    retval->md=(unsigned int*) calloc(msa_abundance.size(), sizeof(unsigned int));

    copy(msa_mz.begin(), msa_mz.end(), retval->mz);
    copy(msa_abundance.begin(), msa_abundance.end(), retval->intensity);
    copy(msa_abundance.begin(), msa_abundance.end(), retval->intensity_normalized);

    for(std::vector::iterator it = msa_mz.begin(); it != msa_mz.end(); it++)
    {
        mass_diff = *it-monoisotopic_mass;

        m.push_back(mass_diff);

        if(abs(mass_diff) < delta)
        {
            delta = abs(mass_diff);
            scale = msa_abundance[i];
        }
        i++;
    }

    scale = normal / scale;

    for(i=0; retval->num_entries; i++)
    {
        retval->intensity_normalized[i] *= scale;
    }

    copy(m.begin(), m.end(), retval->md);

    return retval;
}

 
This parses a V2000 or V3000 molfile into a molecule object, iterates over the atoms and generates the elemental composition for Mercury7, calculates the exact mass and the theoretical isotope pattern. It then stores the m/z vector, the peak vector and the derived peak vector normalized to some value, e.g. 100%, and the mass distance relative to the base peak m/z value. All of this is stored in the C structure _ISOTOPE_PATTERN in order to transfer it from C++ to the C code of PostgreSQL.
 
Over to the PostgreSQL side of things :-)
 
PG_FUNCTION_INFO_V1 (pgchem_isotope_pattern);

/*
 * Set returning function (SRF): returns one row per isotope peak of the
 * molecule argument: (m/z, intensity, intensity_normalized, mdtbp).
 *
 * Arguments: molecule, charge (int32), normalization target (float8).
 * On the first call the pattern is computed via the C++ bridge
 * ob_molfile_to_isotope_pattern(), copied into palloc'ed memory owned by
 * the SRF's multi-call memory context, and then streamed out row by row
 * on subsequent calls.
 */
Datum
pgchem_isotope_pattern (PG_FUNCTION_ARGS)
{
    _ISOTOPE_PATTERN *isotope_pattern = NULL;  /* malloc'ed result from the C++ side */
    _ISOTOPE_PATTERN *pattern_copy = NULL;     /* palloc'ed copy owned by PostgreSQL */
    FuncCallContext *ctx=NULL;
    MemoryContext original_ctx=NULL;
    DECOMPRESSED_DATA *original_data=NULL;
    char *tmpMolfile=NULL;

    if(SRF_IS_FIRSTCALL())
    {
        MOLECULE *arg_molecule = PG_GETARG_MOLECULE_P (0);
        int32 arg_charge = PG_GETARG_INT32 (1);
        float8 arg_normal = PG_GETARG_FLOAT8 (2);
        ctx = SRF_FIRSTCALL_INIT();

        /* Allocations that must survive across calls go into the
         * multi-call memory context. */
        original_ctx = MemoryContextSwitchTo(ctx->multi_call_memory_ctx);

        /* Obtain a molfile for OpenBabel: either decompress the stored
         * original V2000/V3000 molfile, or generate a V2000 one from the
         * stored SMILES. */
        if(arg_molecule->original_format==FORMAT_V2000 || arg_molecule->original_format==FORMAT_V3000)
        {
            original_data = decompress_data(CIPTR(arg_molecule),arg_molecule->compressed_sizeo, arg_molecule->sizeo);
            /* strdup: plain malloc'ed copy, freed right after use below.
             * NOTE(review): original_data itself does not appear to be
             * freed here -- presumably context/caller managed; confirm. */
            tmpMolfile = strdup(original_data->decompressed_data);
        }
        else
        {
            tmpMolfile = ob_smiles_to_V2000 (SMIPTR(arg_molecule));
        }

        isotope_pattern = ob_molfile_to_isotope_pattern(tmpMolfile, arg_charge, arg_normal);

        if(NULL!=tmpMolfile)
        {
            free(tmpMolfile);
        }

        if(NULL == isotope_pattern)
        {
            /* elog(ERROR) aborts via longjmp, so the PG_RETURN_NULL()
             * below is never reached; it only documents intent. */
            elog (ERROR, "Could not generate isotope pattern!");
            PG_RETURN_NULL();
        }

        /* Deep-copy the malloc'ed result into palloc'ed memory so its
         * lifetime is tied to the SRF memory context, then release the
         * malloc'ed original immediately. */
        pattern_copy = (_ISOTOPE_PATTERN*) palloc0(sizeof(_ISOTOPE_PATTERN));
        pattern_copy->mz = (double*) palloc0(isotope_pattern->num_entries*sizeof(double));
        pattern_copy->intensity = (double*) palloc0(isotope_pattern->num_entries*sizeof(double));
        pattern_copy->intensity_normalized = (double*) palloc0(isotope_pattern->num_entries*sizeof(double));
        pattern_copy->md = (unsigned int*) palloc0(isotope_pattern->num_entries*sizeof(unsigned int));

        pattern_copy->num_entries = isotope_pattern->num_entries;

        memcpy(pattern_copy->mz, isotope_pattern->mz, isotope_pattern->num_entries*sizeof(double));
        memcpy(pattern_copy->intensity, isotope_pattern->intensity, isotope_pattern->num_entries*sizeof(double));
        memcpy(pattern_copy->intensity_normalized, isotope_pattern->intensity_normalized, isotope_pattern->num_entries*sizeof(double));
        memcpy(pattern_copy->md, isotope_pattern->md, isotope_pattern->num_entries*sizeof(unsigned int));

        if(NULL != isotope_pattern->md)
            free(isotope_pattern->md);

        if(NULL != isotope_pattern->mz)
            free(isotope_pattern->mz);

        if(NULL != isotope_pattern->intensity)
            free(isotope_pattern->intensity);

        if(NULL != isotope_pattern->intensity_normalized)
            free(isotope_pattern->intensity_normalized);

        if(NULL != isotope_pattern)
            free(isotope_pattern);

        isotope_pattern = NULL;

        /* One result row per isotope peak */
        ctx->max_calls = pattern_copy->num_entries;
        ctx->user_fctx = pattern_copy;

        if(get_call_result_type(fcinfo, NULL, &ctx->tuple_desc) != TYPEFUNC_COMPOSITE)
        {
            elog (ERROR, "Calling the isotope pattern SRF in a context that does not expect tuples in return is not supported!");
        }

        BlessTupleDesc(ctx->tuple_desc);

        MemoryContextSwitchTo(original_ctx);
    }

    /* Per-call section: emit the next peak, or signal completion. */
    ctx = SRF_PERCALL_SETUP();

    isotope_pattern = ctx->user_fctx;

    if(ctx->call_cntr < ctx->max_calls)
    {
        HeapTuple rettuple = NULL;
        Datum *retvals;
        bool *retnulls;

        retvals = (Datum*) palloc0(4*sizeof(Datum));
        retnulls = (bool*) palloc0(4*sizeof(bool));

        retvals[0] = Float8GetDatum(isotope_pattern->mz[ctx->call_cntr]);
        retvals[1] = Float8GetDatum(isotope_pattern->intensity[ctx->call_cntr]);
        retvals[2] = Float8GetDatum(isotope_pattern->intensity_normalized[ctx->call_cntr]);
        /* NOTE(review): the SQL declaration exposes this column as
         * 'integer'; UInt8GetDatum looks like a mismatch for an
         * unsigned int value (UInt32GetDatum would match) -- confirm. */
        retvals[3] = UInt8GetDatum(isotope_pattern->md[ctx->call_cntr]);

        retnulls[0] = false;
        retnulls[1] = false;
        retnulls[2] = false;
        retnulls[3] = false;

        rettuple = heap_form_tuple(ctx->tuple_desc, retvals, retnulls);

        pfree(retvals);
        pfree(retnulls);

        SRF_RETURN_NEXT(ctx, HeapTupleGetDatum(rettuple));
    }
    else
    {
        SRF_RETURN_DONE(ctx);
    }
}

 
This is a set returning function, meaning that it can return multiple rows in one call, i.e. it returns a complete table. Basically it retrieves the molecule argument from PostgreSQL, calls ob_molfile_to_isotope_pattern(), constructs a set of tuples and returns them to PostgreSQL. The basics of set returning functions can be found here and here.
 
Now we can declare the function to PostgreSQL:
 
-- SQL-level declaration of the SRF: takes a molecule plus an optional
-- charge (default 0) and normalization target (default 100.0 = percent
-- of the base peak). Returns one row per isotope peak: m/z, raw
-- intensity, normalized intensity, and mass distance to base peak.
CREATE OR REPLACE FUNCTION isotope_pattern(IN molecule, IN integer DEFAULT 0, IN double precision DEFAULT 100.0)
  RETURNS TABLE("m/z" double precision, intensity double precision, intensity_normalized double precision, mdtbp integer) AS
'libpgchem', 'pgchem_isotope_pattern'
  LANGUAGE c IMMUTABLE STRICT;

 
and use it:
select * from isotope_pattern('c1ccccc1I'::molecule) where intensity >= 0.01;
 
m/z  intensity intensity_normalized mdtbp
203.9436 0.934689933304048 100 0
204.946987674881 0.0634781699018538 6.79136124612619 1


 
  

Visualized it looks like this:


Nice. And everything (except the GUI) in the database.

Thursday, December 4, 2014

Remember the Therac!

Personal opinion warning!!!

Every time I read something like this, I immediately think of the famous Therac-25.

But let's start from the bottom...

"If you are deploying technology in your healthcare organization, ask yourself first if your people are ready. If they are not, you are setting the stage for failure."

This is so generic, Mr. Crounse should win a prize. I'm sure, there is a prize somewhere for phrases like that.

How about:

"If you are deploying technology in your organization, ask yourself first if your people are ready. If they are not, you are setting the stage for failure."

Almost universally true but the gain of knowledge is essentially zilch.

But what really irks me is stuff like this (on the training of physicians):

"We spend our young adulthood immersed in the scholarly pursuit of a medical degree. We take four or more years in specialty training. [...] It’s all about following a certain process, a definitive kind of workflow"

Call me oldschool, but I expect a physician to be well trained and following certain, proven processes, when I lay my fate into her hands!

IT has already a major role in healthcare. Without advanced signal processing and medical imaging, MRI would be useless. Genome data simply cannot be analyzed without the help of computers. The human body can be simulated to some extent to reduce the need of testing new drugs on animals and human beings alike. Particle therapy needs absolute precision controls, provided by computers. And so on and on...

And what do I read?

"They are embracing cloud technologies to streamline IT resources and focus more of those limited resources on that which healthcare systems are all about—providing care to patients and increasingly, focusing on ways to improve population health and disease prevention."

OK, people are expensive, care is expensive so let's use IT to squeeze more out of the staff we have.

There is no single sentence about medical progress in this article. It's from top to bottom about cost-cutting.

I have to give credit to the overall assumption, that something good could be gained  from the integration of health-care data, though. A lot of knowledge about how diseases work and what weapon works best to fight them with could be gained from a holistic view on all the data available.

Yet, in many countries, there is a strict regulation on medical data. And here I have to give credit again - it's all about people. People building systems, people using systems and people having trust in systems.

In 2010 and 2011, physicians manipulated medical data of their patients waiting for a liver transplant in order to push them higher up the waiting list. This so called Organspendeskandal led to the lowest trust and henceforth lowest number of potential organ donors in Germany since 2002.

I am a registered bone marrow donor at DKMS. Yet, we are only a scant 25 million worldwide.  Already way too few. Guess what would happen, if there was a major security breach in this database? Or in any other large health data database?

"Information is no candy bar", they say in Bombardiers (hilarious and highly recommended). If you eat the candy bar it's gone, but information can be stolen without notice.

Let's assume somebody would steal all the paper patient files from a hospital, or the insurance company. This could be noticed, because a) they are gone and b) somebody might ask what the four trucks full of paper are all about. Electronic files can be stolen without notice on a USB stick.

We, the people, still cannot build reliable and secure software systems at a large scale. SQL Injection is still #1 at OWASP. As it was 2010. 2007 it was on #2. Unbelievable.

Even in highly specialized and controlled areas, we fail, fail, fail.

So, where does this strict regulation on medical data come from again?

Whenever you work on hard or software for healthcare applications - please remember the Therac.

So much useful knowledge could be gained, but so much trust can be lost.

Personal opinion warning!!!

Wednesday, November 26, 2014

A working RAMDisk for PostgreSQL on Windows

Aside from the debate if this is useful or dangerous, I tried several free RAMDisks for Windows, Gavotte RamDisk, AMD Radeon RAMDisk and SoftPerfect RAM Disk - and none of them works well with PostgreSQL.

Either PostgreSQL refuses to use them at all for datafiles or it seems to work, but at some point in time the server tells you, that there is something wrong with the datafiles on the ramdrive and data corruption bites.

Apparently most ramdrives on Windows provide drive emulations only good enough to support simple file storage.

ImDisk is the first I tried, that seems to work well with PostgreSQL.

It's free, has a GUI and a command line, and the driver is properly signed for x64 Windows.

Friday, November 7, 2014

There are people := {a,b} for a ≠ b IN (SELECT people FROM world): Those who understand SQL, and those who have to wait.

Couldn't resist. ;->

I just pushed a view on an Oracle Database from 464 seconds down to 3 seconds (~ 155x speedup, i.e. about 0.65% of the original execution time) by removing a few pointless ORDER BYs and replacing one expensive JOIN with subqueries.

Thursday, October 30, 2014

ToroDB Adventures: Adding unimplemented commands

From

> db.createCollection("test")
{ "ok" : 0, "errmsg" : "Unimplemented command: create", "code" : 1000002 }


to

> db.createCollection("test")
{ "ok" : 1 }


in five simple steps:

1.) Find out how the command is called internally. toroDB tells you that in the message:

"Unimplemented command: create"

so it is "create".

2.) Look up the "create" command. Start in QueryCommandProcessor.java with the QueryCommandGroup enum which contains all enums of all known commands. Since it is an administration command, you'll find it in AdministrationQueryCommand.java.

3.)  "create" does nothing at the moment, so add some code.
I just copied it from "createIndexes" so

create,

becomes now

create {
        @Override
        public void doCall(RequestBaseMessage queryMessage, BSONDocument query, ProcessorCaller caller) throws Exception {
            caller.create(query);
        }
    },


4.) Now, ProcessorCaller needs to know about the new command. It's  enclosed in QueryCommandProcessor.java, so by adding

public void create(@Nonnull BSONDocument document) throws Exception {
queryCommandProcessor.create(document, messageReplier);
}

to ProcessorCaller AND the prototype

public void create(@Nonnull BSONDocument document, @Nonnull MessageReplier messageReplier) throws Exception;

to the enclosing QueryCommandProcessor class, make it known.

5.) Implement the command in ToroQueryCommandProcessor.java, which is the actual implementation of QueryCommandProcessor:

// Handler for the MongoDB "create" (createCollection) command in ToroDB.
// Stub implementation: it extracts the recognized command options and
// unconditionally acknowledges with { "ok" : 1 }; the options are read
// but not acted upon yet (see surrounding text).
@Override
    public void create(BSONDocument document, MessageReplier messageReplier) {
      Map keyValues = new HashMap();
      keyValues.put("ok", MongoWP.OK);
       
      // Collection name and creation options as sent by the client.
      // NOTE(review): values intentionally unused in this first version.
      String collectionName = (String) document.getValue("create");
      Boolean capped = (Boolean) document.getValue("capped");
      Boolean autoIndexId = (Boolean) document.getValue("autoIndexId");
      Boolean usePowerOf2Sizes = (Boolean) document.getValue("usePowerOf2Sizes");
      Double size = (Double) document.getValue("size");
      Double max = (Double) document.getValue("max");
       
      // Reply with the acknowledgement document, no cursor attached.
      BSONDocument reply = new MongoBSONDocument(keyValues);
      messageReplier.replyMessageNoCursor(reply);
    }


And that's pretty much it. As of now it just reads all allowed values from the command and acknowledges OK. But now everything is set to make it a "real" command if needed.

Monday, October 27, 2014

The case for map/reduce with relational databases

I was at pgconf.eu and held a lightning talk about map/reduce with PostgreSQL. Upfront, I was asked "Why do you want to do that anyway?" and my initial response was like, "Because I can.". :-)

But that got me thinking about the real case behind the idea. What is the heart of map/reduce? Citing from the original paper:

"MapReduce is a programming model and an associated implementation for processing and generating large data sets." Note the word "sets"?

"Users specify a map function that processes a key/value pair to generate a set of intermediate key/value pairs, and a reduce function that merges all intermediate values associated with the same intermediate key.", or, expressed in a more formal way

map (k1,v1)                -> list(k2,v2)
reduce (k2,list(v2))    -> list(v2)


Please note, that this is a bit more precise than the initial definition. The paper explains:

"I.e., the input keys and values are drawn from a different domain than the output keys and values. Furthermore, the intermediate keys and values are from the same domain as the output keys and values."

This is important, because the authors are introducing a domain transformation of the input data here. That is, in my opinion, already the heart of map/reduce.

Going back to the initial definition, this is basically what all RDBMS already do when processing parallel queries, be it by builtin ability or bolted on like with PL/Proxy + PostgreSQL: In the first step the input set is broken down to partitions, then the query runs on that partitions in parallel and produces intermediate result sets and finally that intermediate result sets are aggregated to the final result set. But the formal definition above adds a little twist, the domain transformation.

To clarify this, I'll use the canonical example, counting words in a text. The map function converts semi structured data, a text with lines of arbitrary length, into a well structured set of key (a word) and value (its count) tuples. This is the difference and the key to the power of map/reduce.
The ability to handle semi structured data which the relational model usually does not handle very well. (And I won't say unstructured data. Truly unstructured data is statistical noise.)

But modern RDBMS, especially PostgreSQL, often already have functions to transform semi structured data into relations and/or allow for user defined functions to extend their capabilities and that allows for running a map/reduce type job inside a RDBMS. Still, why would somebody want to do this?

1.) Integration
An awful lot of data is stored in relational models and will stay there. Simultaneously, especially for analytical workloads which become more and more important, the need for integrating relational and semi-structured data grows. Why handle them in different systems when one will do?
This decision of course heavily depends on the real world requirements. But rest assured that the datacenter guys who have to run the show will like to operate one database better than 2..n.

2.) Sets
Remember the word "sets" from the initial definition? And now the definition of a "relation" in a RDBMS:

"R is a relation on these n domains if it is a set of elements of the form (d1, d2, ..., dn) where dj ∈ Dj for each j=1,2,...,n." (E. F. Codd (Oct 1972). "Further normalization of the database relational model". "Data Base Systems". Courant Institute: Prentice-Hall. ISBN 013196741X.)

If a relation is a set of tuples with values from some domain D and map/reduce does domain transformation on key/value pairs (aka. tuples) what does that call for? Right, a very efficient set processor. Since relational DBMS are very efficient set processors by nature, they allow for
writing compact map/reduce functions that are also less error prone due to the declarative nature of SQL.

To clarify what I mean take a look at the following map and reduce functions for wordcount written for MongoDB in JavaScript from here:

// Map step: tokenize this document's 'summary' field and emit (word, 1)
// for every non-empty token. Runs once per document; 'this' is the
// current document and emit() is supplied by MongoDB's mapReduce runtime.
var map = function() { 
    var summary = this.summary;
    if (summary) {
        // quick lowercase to normalize per your requirements
        summary = summary.toLowerCase().split(" ");
        for (var i = summary.length - 1; i >= 0; i--) {
            // might want to remove punctuation, etc. here
            if (summary[i])  {      // make sure there's something
               emit(summary[i], 1); // store a 1 for each word
            }
        }
    }
};

// Reduce step: sum all partial counts emitted for one word.
var reduce = function( key, values ) {
    var total = 0;
    for (var idx = 0; idx < values.length; idx++) {
        total += values[idx];
    }
    return total;
}

db.so.mapReduce(map, reduce, {out: "word_count"})


PostgreSQL:

For the code, see my previous post.

Well, while it seems to require more code than MongoDB, there is a subtle difference. The most PostgreSQL code is standard boilerplate to write a set returning function and to make PL/Proxy work. Once you got that right, you usually never have to look back. The actual work is done in two lines of SQL:

SELECT TRIM(both from word),count(1) FROM (SELECT regexp_split_to_table(line, E'\\W+') as word FROM kjb) w GROUP BY word

and 

SELECT word,sum(count) FROM map_kjb() AS (word text, count bigint) WHERE word != '' GROUP BY word

IMHO, the ability to express this with extensively tried and tested functions instead of having to implement them yourself combined with strong typing is worth so much, that one should give it a try before looking somewhere else. Granted, raw performance may become so paramount that an alternative technology might be called for, but if you already use PostgreSQL now there is another reason to like and not to leave it.

And this is it, the long answer I didn't already have ready at the conference.

P.S. It also allows to move computation instead of data around.

Monday, October 20, 2014

pgconf.eu 2014

On the worker nodes:

 -- Corpus table on the worker nodes: one raw input text line per row
 -- (e.g. the King James Bible used in the word count example).
 CREATE TABLE kjb
(
  line text
);



-- Map step (worker nodes): split every input line into words and emit
-- one (word, count) tuple per distinct word found on this node.
-- Callers must supply a column definition list, e.g.
--   SELECT * FROM map_kjb() AS (word text, count bigint);
CREATE OR REPLACE FUNCTION map_kjb()
  RETURNS SETOF record AS
$BODY$
BEGIN
-- RETURN QUERY replaces the original FOR ... RETURN NEXT loop: same
-- result set, but set-based and without per-row PL/pgSQL overhead.
RETURN QUERY SELECT TRIM(both from word),count(1) FROM (SELECT regexp_split_to_table(line, E'\\W+') as word FROM kjb) w GROUP BY word;
RETURN;
END;
$BODY$
  LANGUAGE plpgsql VOLATILE
  COST 1000
  ROWS 1000;


On the head node:

-- Result row type for the word count reducer: one (word, total) pair.
CREATE TYPE t_mr_wc AS (word text, count bigint);

-- Head-node proxy for map_kjb(): PL/Proxy fans the call out to every
-- worker node in cluster 'head' and streams all partial (word, count)
-- results back to the caller.
CREATE OR REPLACE FUNCTION map_kjb()
  RETURNS SETOF record AS
$BODY$
CLUSTER 'head';
RUN ON ALL;
$BODY$
  LANGUAGE plproxy VOLATILE
  COST 1000
  ROWS 1000;


-- Reduce step (head node): merge the per-node partial counts coming from
-- map_kjb() into one total per word, skipping the empty-string token.
CREATE OR REPLACE FUNCTION reduce_kjb()
  RETURNS SETOF t_mr_wc AS
$BODY$
DECLARE
    result public.t_mr_wc;
BEGIN
    FOR result IN
        SELECT word, sum(count)
        FROM map_kjb() AS (word text, count bigint)
        WHERE word != ''
        GROUP BY word
    LOOP
        RETURN NEXT result;
    END LOOP;
END;
$BODY$
  LANGUAGE plpgsql VOLATILE
  COST 100
  ROWS 100;


1. Load data into tables.

2. Get word count:

SELECT * FROM reduce_kjb();

3. Get top 21 word counts: 

SELECT * FROM reduce_kjb() ORDER BY 2 DESC LIMIT 21; 

Wednesday, September 3, 2014

Using Code::Blocks with the Intel C/C++ compiler on Windows in five minutes

Code::Blocks apparently does not support the latest Intel C/C++ compiler out of the box. Trying to set up the necessary include paths for both the Visual Studio compiler and the Intel compiler leads to madness.

But there is an easier way:

Make two Intel compiler entries in Code::Blocks for the 32 and 64 bit version with the correct base directory, e.g. C:\Program Files (x86)\Intel\Composer XE\bin.
The 32 bit toolchain executables are prefixed ia32\, the 64 bit ones intel64\.

When you have installed the Intel compiler, there is an 'Command Prompt with Intel Compiler' entry in the start menu for 32 and 64 bit targets each. This opens a command prompt with all necessary paths and environment variables set correctly.

Just start codeblocks.exe from this command prompt and it inherits the environment. Then you can select the 32 or 64 bit Intel compiler option in the 'Project->Build options' and it works without further ado.

The only thing you cannot do is switching between 32 and 64 bit targets on the fly, you have to start Code::Blocks with the correct environment first.

Thursday, March 20, 2014

Eclipse compiler produces faster FP code?

While testing this very simple FP code in Java:

package the.plateisbad;

/*
 * Trivial floating-point micro-benchmark: multiplies i * y in a loop
 * arg_flops times and reports the elapsed wall-clock time.
 *
 * NOTE: the compiled bytecode of this exact source is analyzed further
 * down in the post (ECJ vs javac loop shapes), so the code must stay
 * as-is.
 */
public class Simple {

  public static void main(String[] args) {
    final long arg_flops;
    double i,x = 0.0d, y = 0.0d;
    final long start, end;

    // Expect exactly two arguments: iteration count and multiplier.
    // (The usage string appears to have lost its argument placeholders
    // when published -- TODO confirm against the original source.)
    if (args.length != 2) {
      System.out.println("Usage: the.plateisbad.Simple ");
      return;
    }

    arg_flops = Long.parseLong(args[0]);
    y = Long.parseLong(args[1]);

    System.out.println("Thinking really hard for " + arg_flops + " flops...");

    start = System.currentTimeMillis();

    // The hot loop: one double multiplication per iteration.
    for (i = 0; i < arg_flops; i++) {
      x = i * y;
    }

    end = System.currentTimeMillis();

    System.out.println("We calculated: " + x + " in " +(end-start)+ " ms");
  }
}



I've stumbled over the fact, that it runs considerably faster when compiled with the Eclipse ECJ compiler compared to a standard javac.

With ECJ, executed with JDK 1.7:

java -server the.plateisbad.Simple 1000000000 3

Thinking really hard for 1000000000 flops...
We calculated: 2.999999997E9 in 1964 ms

With javac, executed with JDK 1.7:

java -server the.plateisbad.Simple 1000000000 3

Thinking really hard for 1000000000 flops...
We calculated: 2.999999997E9 in 3514 ms

With the new JDK 1.8, there is no noticeable difference between javac and ECJ:

java -server the.plateisbad.Simple 1000000000 3

Thinking really hard for 1000000000 flops...
We calculated: 2.999999997E9 in 3727 ms

but it is always the slowest of the three. The Bytecode tells me that ECJ builds a tail controlled loop which loops while i is < arg_flops:

  64: invokestatic  #52                 // Method java/lang/System.currentTimeMillis:()J
      67: lstore        9
      69: dconst_0     
      70: dstore_3     
      71: goto          84
      74: dload_3      
      75: dload         7
      77: dmul         
      78: dstore        5
      80: dload_3      
      81: dconst_1     
      82: dadd         
      83: dstore_3     
      84: dload_3      
      85: lload_1      
      86: l2d          
      87: dcmpg        
      88: iflt          74
      91: invokestatic  #52                 // Method java/lang/System.currentTimeMillis:()J


while javac builds a head controlled loop that exits if i >= arg_flops:

      67: invokestatic  #13                 // Method java/lang/System.currentTimeMillis:()J
      70: lstore        9
      72: dconst_0     
      73: dstore_3     
      74: dload_3      
      75: lload_1      
      76: l2d          
      77: dcmpg        
      78: ifge          94
      81: dload_3      
      82: dload         7
      84: dmul         
      85: dstore        5
      87: dload_3      
      88: dconst_1     
      89: dadd         
      90: dstore_3     
      91: goto          74
      94: invokestatic  #13                 // Method java/lang/System.currentTimeMillis:()J

And ECJ uses StringBuffer while javac uses StringBuilder for the String operations, but since these are not in the loop, that should not make any difference.

Does somebody know what is going on here?

UPDATE: This seems to be an anomaly. SciMark 2.0 shows no significant differences between ECJ and javac and jdk1.7 and jdk1.8 - with 1.8 being slightly faster.

Wednesday, March 5, 2014

A suggestion to all architects of high-security buildings ;->

Please, don't put the restrooms outside the security gates!

Wednesday, February 12, 2014

Arbitrary parallel (well, almost) ad-hoc queries with PostgreSQL

Contemporary PostgreSQL lacks the ability to run single queries on multiple cores, nodes etc., i.e. it lacks automatic horizontal scaling. While this seems to be under development, what can be done today?

PL/Proxy allows database partitioning and RUN ON ALL executes the function on all nodes simultaneously. PL/Proxy is limited to the partitioned execution of functions and has good reasons for this design. But PostgreSQL can execute dynamic SQL within functions, so let's see how far we can get.

Worker function (on all worker nodes):

-- Worker-node function: execute an arbitrary read-only query given as
-- text and return its rows as SETOF record. Callers must supply a
-- column definition list, e.g.
--   SELECT * FROM parallel_query('select * from users') AS (username text);
CREATE OR REPLACE FUNCTION parallel_query(statement text)
  RETURNS SETOF record AS
$BODY$
BEGIN
-- Guard against non-SELECT statements.
-- NOTE(review): this prefix check is easily bypassed (e.g. a
-- data-modifying WITH ... statement does not start with SELECT);
-- execute under a read-only role for real protection.
IF lower($1) LIKE 'select%' THEN
-- RETURN QUERY EXECUTE streams the result set directly; identical
-- behavior to the original FOR ... RETURN NEXT loop with less code.
RETURN QUERY EXECUTE $1;
ELSE
RAISE EXCEPTION 'Only queries allowed';
END IF;
END
$BODY$
  LANGUAGE plpgsql VOLATILE;


Proxy function (on all head nodes):

-- Head-node entry point: PL/Proxy runs the statement on all worker
-- nodes of cluster 'head' in parallel and concatenates their results.
CREATE OR REPLACE FUNCTION parallel_query(statement text)
  RETURNS SETOF record AS
$BODY$
 CLUSTER 'head'; RUN ON ALL;
$BODY$
  LANGUAGE plproxy VOLATILE;


Table (on all worker nodes):

-- Demo table on the worker nodes; rows are partitioned across nodes by
-- username hash (~5000 of the 10000 example rows land on each of two nodes).
CREATE TABLE users
(
  username text NOT NULL,
  CONSTRAINT users_pkey PRIMARY KEY (username)
)
WITH (
  OIDS=FALSE
);


With 10000 rows in two nodes, partitioned by username hash (~5000 on each node)

select * from parallel_query('select * from users') as (username text);

returns all 10000 rows. Since the nodes can be databases within the same server, there is no need for additional hardware, server installations etc. But if more performance is required in the future, adding more boxes is possible.

All it takes is logical partitioning and a bit of PL/pgSQL if you really need to run parallel queries.

There are some differences though. Take the following query:

select * from parallel_query('select max(username) from users') as (username text);

"user_name_9995"
"user_name_9999"

It now returns two maximums, one for each partition. To get the expected result a second stage is needed:

select max(username) from parallel_query('select max(username) from users') as (username text);

"user_name_9999"

The same applies for other aggregation functions like avg() etc.

The proxy function can finally be hidden in a VIEW:

CREATE OR REPLACE VIEW "users" AS select * from parallel_query('select * from users') as (username text);

Thursday, January 23, 2014

From palloc() to palloc0()

I've just replaced all palloc()/memset() pairs for getting zeroed-out memory with palloc0(). I doubt that this will show a significant speedup, but it fits better into the PostgreSQL memory model and reduces the lines of code.

E.g.:

retval = (text *) palloc (len + VARHDRSZ);
memset(retval,0x0,len + VARHDRSZ);

becomes:

retval = (text *) palloc0 (len + VARHDRSZ);

The changes have been comitted to Github.

Tuesday, December 17, 2013

It compiles...

>But, since Postgresql 9.3 just came out, I'll see if I can manage at least to compile pgchem::tigress against 9.3, OpenBabel 2.3.2 and Indigo 1.1.11 before 2013 ends...

Short update: I have merged the two pull requests from Steffen Neumann and Björn Grüning into the repository and then made some minor corrections. LANGUAGE C (case insensitive but without the '') solves the one problem, replacing int4 with int32 the other. Actually this was no gcc issue, but the PostgreSQL guys have apparently decided to remove int4 as a PostgreSQL-internal datatype from 9.2 to 9.3.

I had always wondered why there were so many duplicate but internally identical (e.g. int4 (number of bytes) and int32 (number of bits)) datatypes anyway. Probably some legacy...

Also, pgchem_tigress can now be installed as a relocatable extension with the CREATE EXTENSION mechanism. No need to run various installation scripts anymore (but still supported).

I still need to put an 'install' target into the Makefile to fully integrate pgchem_tigress into the extension system. Currently, the shared objects still have to be copied by hand before calling CREATE EXTENSION. The PGXS Makefile already takes care of that - there is progress towards a new release.

Thursday, October 24, 2013

Natural vs. Chemical

Last week I attended a very interesting lecture about metabolic pathways.

Maybe it would help all the people who like to make a firm distinction between 'natural products' -> good and 'chemical products' -> bad to attend such a lecture from time to time...

Wednesday, September 18, 2013

Long time no see

Hi all,

the development of pgchem::tigress - and this Blog - have been quiet for some time now.

Professionally, I'm currently moving into two new spheres of activity, document management for product registration (i.e. license to sell) and bioinformatics, and this occupies most of my time. That my wife had to go to the hospital for a few months (nothing life threatening, but quite hindering) didn't help much either. ;->

But, since Postgresql 9.3 just came out, I'll see if I can manage at least to compile pgchem::tigress against 9.3, OpenBabel 2.3.2 and Indigo 1.1.11 before 2013 ends...

Wednesday, September 26, 2012

Bekloppt

Drug discovery as a Facebook game? WTF...

Friday, December 23, 2011

Christmas presents

Jérôme Pansanel has completed the new serialization code for mychem and pgchem, so there is no need to handle stereo and non-stereo queries differently anymore. I have moved the index functions to GCC's vector extensions where applicable, and the first result is that index build times have been roughly cut by half while substructure search times have improved, but not that much.

Index build times


SystemIndex build time
pgchem with OpenBabel or Indigo352137 ms
pgchem with OpenBabel or Indigo vectorized192815 ms

OpenBabel with binary storage and FP2 fingerprint vectorized


QueryHitsno IndexHitswith Index
GH2484098416 ms484017044 ms
GH726094053 ms2601564 ms
GH13580113690 ms58034504 ms
GH162691099365 ms2691055154 ms

Merry Christmas and a happy new year!

Saturday, December 17, 2011

Benchmark data published

The base data used in the previous benchmarks can be found here: pubchem_0_100000.

Thursday, December 15, 2011

Selected GH17 results for 10^6 structures

GH17 substructure search speed


OpenBabel with binary+SMILES storage and FP2 fingerprint


QueryHitsno IndexHitswith Index
GH24840108807 ms484021164 ms
GH7260105050 ms2601934 ms
GH13580118978 ms58052416 ms
GH1626910109886 ms2691064742 ms

Indigo with binary storage and ext+sub fingerprint


QueryHitsno IndexHitswith Index
GH24840213075 ms484027887 ms
GH7410178963 ms4104451 ms
GH13580251938 ms58039134 ms
GH1627100172534 ms2710080523 ms

Bingo 1.7beta2 with molfiles as text storage


QueryHitsno IndexHitswith Index
GH24710647889 ms471021733 ms
GH7410538784 ms4106658 ms
GH13580675093 ms58012418 ms
GH1627100528891 ms2710028541 ms

Index build times


SystemIndex build time
pgchem with OpenBabel or Indigo352137 ms
Bingo3458681ms


Again, Bingo without its index is apparently killed by the overhead of parsing text into the internal molecule format. With index it's a mixed bag: while it shines at GH13 and GH16, pgchem is about equal or faster at GH2 and GH7.

Wednesday, December 14, 2011

GH17 results

Mikhail Rybalkin from GGA Software asked me to do this, so here it is...

The GH17 test queries used


There is a small set of queries used in article Chemical substructure search in SQL by Golovin and Henrick. These queries were lately reused in other articles:

GH1 ONC1CC(C(O)C1O)[n]2cnc3c(NC4CC4)ncnc23
GH2 Nc1ncnc2[n]cnc12
GH3 CNc1ncnc2[n](C)cnc12
GH4 Nc1ncnc2[n](cnc12)C3CCCC3
GH5 CC12CCC3C(CCC4=CC(O)CCC34C)C1CCC2
GH6 OC2=CC(=O)c1c(cccc1)O2
GH7 Nc1nnc(S)s1
GH8 C1C2SCCN2C1
GH9 CP(O)(O)=O
GH10 CCCCCP(O)(O)=O
GH11 N2CCC13CCCCC1C2Cc4c3cccc4
GH12 s1cncc1
GH13 C34CCC1C(CCC2CC(=O)CCC12)C3CCC4
GH14 CCCCCCCCCCCP(O)(O)=O
GH15 CC1CCCC1
GH16 CCC1CCCC1
GH17 CCCC1CCCC1

GH17 substructure search speed


OpenBabel with binary+SMILES storage and FP2 fingerprint


QueryHitsno IndexHitswith Index
GH109517 ms025 ms
GH24848519 ms484111 ms
GH3638632 ms6343 ms
GH458950 ms548 ms
GH53610020 ms3678 ms
GH608696 ms032 ms
GH7268279 ms2631 ms
GH81708454 ms17056 ms
GH93488068 ms34871 ms
GH10368820 ms3621 ms
GH11669113 ms6652 ms
GH128317920 ms831124 ms
GH13589864 ms58448 ms
GH1449998 ms436 ms
GH1530088549 ms3008555 ms
GH1626918665 ms2691501 ms
GH1722908717 ms2290560 ms

Indigo with binary storage and ext+sub fingerprint


QueryHitsno IndexHitswith Index
GH1028161 ms032 ms
GH248419188 ms484187 ms
GH36820498 ms6878 ms
GH4523388 ms533 ms
GH53625887 ms3662 ms
GH6021034 ms031 ms
GH74116302 ms4131 ms
GH817016629 ms17078 ms
GH937314005 ms37398 ms
GH103716881 ms3721 ms
GH116624091 ms6678 ms
GH1282914842 ms829210 ms
GH135824470 ms58133 ms
GH14421403 ms436 ms
GH15304715817 ms3047749 ms
GH16271016767 ms2710732 ms
GH17230417524 ms2304788 ms

Bingo 1.7beta2 with molfiles as text storage


QueryHitsno IndexHitswith Index
GH1070277 ms0125 ms
GH247156821 ms471156 ms
GH36857754 ms68125 ms
GH4560067 ms5125 ms
GH53665586 ms36140 ms
GH67957188 ms79125 ms
GH74152134 ms41125 ms
GH817051685 ms170140 ms
GH937347613 ms373138 ms
GH103749961 ms37110 ms
GH116661176 ms66125 ms
GH1277450281 ms829156 ms
GH135861108 ms58156 ms
GH14453636 ms4140 ms
GH15304750213 ms3047343 ms
GH16271051227 ms2710327 ms
GH17230451495 ms2304362 ms

Fingerprint efficiency (with regard to false positives)


FP2


QueryCandidates screenedHits matchedfalse positivesEfficiency
GH10001.000
GH248548410.998
GH3696360.913
GH4375320.135
GH512036840.300
GH6790790.000
GH74126150.634
GH817717070.960
GH9377348290.923
GH10373610.973
GH111236610.537
GH1283183101.000
GH1313465812880.043
GH14204160.200
GH15376030087520.800
GH16330526916140.814
GH17330522907150.762

ext+sub


QueryCandidates screenedHits matchedfalse positivesEfficiency
GH10001.000
GH248448411.000
GH3686801.000
GH45501.000
GH54736110.766
GH60001.000
GH7414101.000
GH817017001.000
GH937337301.000
GH10373701.000
GH11666601.000
GH1282982901.000
GH13259582010.224
GH14204160.200
GH1530613047140.995
GH1627202710100.996
GH17272023044160.847

Index build times


SystemIndex build time
pgchem with OpenBabel or Indigo25690 ms
Bingo336319 ms


Indigo's ext+sub fingerprint is truly more selective than FP2. Still, OpenBabel with binary storage shows the better performance because of its faster matcher and lower query overhead.

Also, the result for GH3, GH7, GH9, GH12, GH15, GH16, and GH17 are different between OpenBabel and Indigo, and Bingo finds 79 hits for GH6 where pgchem finds zero.

Index building on pgchem is 13 times faster than Bingo, but since pgchem (currently) does not support features like tautomer searching or SMARTS searching with index support this comparison is a bit like apples and oranges.

The slow performance of Bingo without index, comparable to pgchem without binary storage, is quite likely a result of the storage of molecules in textual representation. Parsing text to binary molecules is a first class performance killer. Unfortunately, there is no way to convert molecules into native format directly with Bingo for PostgreSQL, but Bingo does the conversion implicitly when building the index.

Friday, December 9, 2011

OBMol de-/serialization revisited

The current serialization/deserialization mechanism in pgchem and mychem does not preserve stereochemistry of OBMol objects properly. As Tim Vandermeersch wrote:

The OBChiralData isn't used anymore. Also the functions OBAtom::IsClockwise, and OBAtom::IsAntiClockwise are obsolate. Instead, you should serialize the OBCisTransStereo and OBTetrahedralStereo data objects.

Fortunately (for me, since I don't have the time at the moment), Jérôme Pansanel has:

...started the serialization of the OBCisTransStereo and OBTetrahedralStereo objects.

For the time being, I have tried the following workaround and it seems to be working well. First, I have removed now unneccessary code from the unserialization:

/*
 * Rebuild an OBMol from pgchem's packed binary serialization.
 *
 * Blob layout as consumed below: one leading unsigned int (skipped --
 * presumably the total blob size; TODO confirm against the serializer),
 * then natoms, then nbonds, then natoms packed _ATOM records followed
 * by nbonds packed _BOND records.
 *
 * Stereochemistry beyond the per-atom chiral flag and wedge/hash bond
 * flags is intentionally NOT restored here (OBCisTransStereo /
 * OBTetrahedralStereo objects are not part of this serialization).
 *
 * Returns false on malformed input (bond endpoint out of range, zero
 * bond order, or a failed AddAtom/AddBond), true on success.
 */
bool unserializeOBMol(OBBase* pOb, const char *serializedInput)
{
  OBMol* pmol = pOb->CastAndClear<OBMol>();
  OBMol &mol = *pmol;
  unsigned int i,natoms,nbonds;

  unsigned int *intptr = (unsigned int*) serializedInput;

  ++intptr; // skip the leading header word

  natoms = *intptr;

  ++intptr;

  nbonds = *intptr;

  ++intptr;

  _ATOM *atomptr = (_ATOM*) intptr;

  mol.ReserveAtoms(natoms);

  OBAtom atom;
  int stereo;

  for (i = 1; i <= natoms; i++) {
    atom.SetIdx(atomptr->idx);
    atom.SetHyb(atomptr->hybridization);
    atom.SetAtomicNum((int) atomptr->atomicnum);
    atom.SetIsotope((unsigned int) atomptr->isotope);
    atom.SetFormalCharge((int) atomptr->formalcharge);
    stereo = atomptr->stereo;

    // stereo value 3 marks a chiral center in the serialized record
    if(stereo == 3) {
      atom.SetChiral();
    }

    atom.SetSpinMultiplicity((short) atomptr->spinmultiplicity);

    if(atomptr->aromatic != 0) {
      atom.SetAromatic();
    }

    if (!mol.AddAtom(atom)) {
      return false;
    }

    atom.Clear(); // reuse the stack OBAtom for the next record

    ++atomptr;
  }

  // bond records follow the atom records directly
  _BOND *bondptr = (_BOND*) atomptr;

  unsigned int start,end,order,flags;

  for (i = 0;i < nbonds;i++) {
    flags = 0;

    start = bondptr->beginidx;
    end = bondptr->endidx;
    order = (int) bondptr->order;

    // reject out-of-range endpoints and zero-order bonds
    if (start == 0 || end == 0 || order == 0 || start > natoms || end > natoms) {
      return false;
    }

    // FIX(readability): the previous cast, (unsigned int) (order == 4) ? ...,
    // bound to the comparison rather than the ternary result. Same behavior,
    // but now written as intended: map serialized order 4 to 5
    // (presumably MDL "aromatic" -> OpenBabel's aromatic order; TODO confirm).
    order = (order == 4) ? 5 : order;

    stereo = bondptr->stereo;

    if (stereo) {
      if (stereo == 1) { // wedge (up) bond
        flags |= OB_WEDGE_BOND;
      }
      if (stereo == 6) { // hash (down) bond
        flags |= OB_HASH_BOND;
      }
    }

    if (bondptr->aromatic != 0) {
      flags |= OB_AROMATIC_BOND;
    }

    if (!mol.AddBond(start,end,order,flags)) {
      return false;
    }

    ++bondptr;
  }

  intptr = (unsigned int*) bondptr;

  // aromaticity/kekulization were stored explicitly in the blob, so mark
  // them as already perceived and skip re-perception
  mol.SetAromaticPerceived();
  mol.SetKekulePerceived();

  return true;
}
Then, when matching, I check if the query might contain tetrahedral stereo. If yes, I build the target OBMol fresh from SMILES, if no directly from the serialized object:
 // Workaround: a '@' in the query SMILES signals tetrahedral stereo,
 // which the binary deserialization above does not restore -- so fall
 // back to rebuilding the target from SMILES in that case.
 if (strchr (querysmi, '@') != NULL)
    {
        //Match against an OBMol generated from SMILES
    }
    else
    {
         //Match against an OBMol deserialized from binary
    }

Tuesday, December 6, 2011

First Light

As promised, here the first results for substructure searching c1ccccc1Cl on the first 100000 compounds from Pubchem: select * from pubchem.compound where compound >= 'c1ccccc1Cl'::molecule

Substructure search speed


RankBuildStorageFingerprintHitsno IndexHitswith Index
1OpenBabelbinary+SMILESFP280709236 ms8067936 ms
2Indigobinaryext+sub804924821 ms80492418 ms
3OpenBabelSMILESFP2807057971 ms80675432 ms

I've checked why the OpenBabel FP2 fingerprint eliminates three structures that otherwise would pass: Using the VF2 OBIsomorphismMapper instead of the OBSmartsPattern, it's also 8067 without index. But it's about four times slower, 90526 ms without index, 8798 with index.
The structures in question are: 80944, 83450 and 99925 and I'm pretty sure it's caused by differences in aromaticity detection.

Stereochemistry in substructure searches


CheckQueryExpectedIndigo binaryOpenBabel SMILESOpenBabel binary
R/S differentselect 'C[C@H]([C@@H](C(=O)O)N)O'::molecule <= 'O=C(O)[C@H](N)[C@@H](O)C'::moleculefalsepassfailfail
R/S sameselect 'C[C@H]([C@@H](C(=O)O)N)O'::molecule <= 'C[C@H]([C@@H](C(=O)O)N)O'::moleculetruepasspasspass
E/Z different select 'C(=C\Cl)\Cl'::molecule <= 'C(=C/Cl)\Cl'::moleculefalsepassfailfail

OpenBabel fails the 'E/Z different' and 'R/S different' checks, but these are known issues up to version 2.3.1. More disturbing are the issues with 'R/S different'. For SMILES I can reproduce it with obgrep, so it's not my code causing it.

Indigo has no obvious issues with matching R/S and E/Z stereochemistry. Speedwise, OpenBabel with binary storage would be the leader of the pack, but it behaves inconsistently compared with its SMILES storage sibling. So the serialization/deserialization code clearly is missing something. But I've found a simple workaround: if the query contains chirality information it uses SMILES, otherwise binary. With that in place, OpenBabel with binary storage is the leader of the pack.

Fingerprint efficiency


Fingerprint typeCandidates screenedHits matchedfalse positivesEfficiency
ext+sub80908049410.995
FP281458067780.99

Pretty close.

Wednesday, November 30, 2011

What's up, doc?


OK, I have now a working (i.e. not crashing & giving meaningful results) hybrid version of pgchem::tigress with OpenBabel and Indigo, where the OpenBabel functions in the search engine have been replaced by Indigo functionality.

Why?

  • To see if it can be done :-)
  • To learn more about programming with Indigo
  • To see which is faster
  • Indigo has better reaction support as it seems

Interestingly I could keep the external interface almost stable. As only a dozen or so functions outside the search engine are affected by the change, I think with some #IFDEFs I can make a common codebase that compiles either to 100% OpenBabel or the hybrid version.

Monday, November 14, 2011

R-Matrix implementation with Indigo

Long time no see.

I have been trying programming with Indigo recently and here is my first, straightforward (to avoid the term 'naive') implementation of an Ugi like R-Matrix (Gasteiger/Engel, Chemoinformatics, 2003, Wiley-VCH, p. 185ff.) for adding make/break/change bond detection to its reaction mapper:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "indigo.h"

/* Map a 1-based (column, row) coordinate onto the flat, row-major
 * index of a square matrix with maxRows entries per side. */
static inline int mp(const int column, const int row, const int maxRows)
{
    const int zeroBasedRow = row - 1;
    const int zeroBasedColumn = column - 1;

    return zeroBasedRow * maxRows + zeroBasedColumn;
}

/*
 * Accumulate bond orders of every molecule delivered by moleculeIter
 * into the totalAtoms x totalAtoms matrix, indexed by the atoms'
 * reaction mapping numbers via mp(). The iterator is consumed but NOT
 * freed here; the caller owns it. Shared by the reactant (B-matrix)
 * and product (E-matrix) passes, which were previously duplicated.
 */
static void accumulateBondOrders(int reaction, int moleculeIter,
                                 int *matrix, int totalAtoms)
{
    int molecule, atom, nbr, bond, atomIter, neighborIter;
    int mappingNumber, targetMappingNumber, index;

    while((molecule = indigoNext(moleculeIter)))
    {
        atomIter = indigoIterateAtoms(molecule);
        while((atom = indigoNext(atomIter)))
        {
            mappingNumber = indigoGetAtomMappingNumber(reaction, atom);

            neighborIter = indigoIterateNeighbors(atom);

            // FIX: use a separate handle for the neighbor instead of
            // clobbering the outer 'atom' variable
            while((nbr = indigoNext(neighborIter)))
            {
                targetMappingNumber = indigoGetAtomMappingNumber(reaction, nbr);

                index = mp(mappingNumber, targetMappingNumber, totalAtoms);

                // record each bond order only once per cell
                if(matrix[index] == 0)
                {
                    bond = indigoBond(nbr);
                    matrix[index] = indigoBondOrder(bond);
                }
            }

            indigoFree(neighborIter);
        }
        indigoFree(atomIter);
    }
}

/*
 * Build an Ugi-style R-Matrix for a reaction loaded from disk:
 * B-matrix (bond orders before) from the reactants, E-matrix (bond
 * orders after) from the products, then report make/break/change bond
 * events from their difference, and save the atom-mapped reaction.
 */
int main(int argc, char *argv[])
{
    int reaction, reactant, reactantIter, productIter, bondMakeBreak;
    int totalAtoms;
    int *bMatrix, *eMatrix;

    totalAtoms = 0;

    qword session = indigoAllocSessionId();

    indigoSetSessionId(session);

    printf("%s\n",indigoVersion());

    reaction = indigoLoadReactionFromFile("c:/temp/testrxn.rxn");

    indigoDearomatize(reaction);

    indigoAutomap(reaction,"discard");

    // count atoms over all reactants to size the square matrices
    reactantIter = indigoIterateReactants(reaction);

    while((reactant = indigoNext(reactantIter)))
    {
        totalAtoms += indigoCountAtoms(reactant);
    }

    indigoFree(reactantIter);

    bMatrix = (int*) calloc((size_t) totalAtoms * totalAtoms, sizeof(int));
    eMatrix = (int*) calloc((size_t) totalAtoms * totalAtoms, sizeof(int));

    // FIX: bail out cleanly instead of dereferencing NULL on OOM
    if (bMatrix == NULL || eMatrix == NULL)
    {
        fprintf(stderr, "out of memory\n");
        free(bMatrix);
        free(eMatrix);
        indigoFree(reaction);
        indigoReleaseSessionId(session);
        return 1;
    }

    // B-matrix: bond orders before the reaction (reactants)
    reactantIter = indigoIterateReactants(reaction);
    accumulateBondOrders(reaction, reactantIter, bMatrix, totalAtoms);
    indigoFree(reactantIter);

    // E-matrix: bond orders after the reaction (products)
    productIter = indigoIterateProducts(reaction);
    accumulateBondOrders(reaction, productIter, eMatrix, totalAtoms);
    indigoFree(productIter);

    // R-matrix = E - B, evaluated cell by cell; positive deltas are
    // made/changed bonds, negative deltas broken/changed bonds
    for(int i=0; i<(totalAtoms*totalAtoms); i++)
    {
        bondMakeBreak=eMatrix[i]-bMatrix[i];

        if(bondMakeBreak>0)
        {
            if(bMatrix[i]>0)
                printf("change: %i to %i\n",(i / totalAtoms)+1,(i % totalAtoms)+1);
            else
                printf("make: %i to %i\n",(i / totalAtoms)+1,(i % totalAtoms)+1);
        }
        else if(bondMakeBreak<0)
        {
            if(bMatrix[i]>1)
                printf("change: %i to %i\n",(i / totalAtoms)+1,(i % totalAtoms)+1);
            else
                printf("break: %i to %i\n",(i / totalAtoms)+1,(i % totalAtoms)+1);
        }
    }

    indigoSaveRxnfileToFile(reaction,"c:/temp/testrxn_mapped.rxn");

    indigoFree(reaction);
    indigoReleaseSessionId(session);
    free(bMatrix);
    free(eMatrix);

    return 0;
}
At the moment it has the following shortcomings:
  1. 50% too much memory consumption, since only half of the R-Matrix would suffice
  2. Each bond is found twice (Atom1 to Atom2 and Atom2 to Atom1)
  3. The make/break/change info is not added to the mapped reaction file

Have fun!