GtEncseq

A software library for sequence storage and access
The GtEncseq is a portable software component providing an efficient data structure for storing multiple biological sequences of variable alphabet size, with customizable character transformations, wildcard support and an assortment of internal representations optimized for different distributions of wildcards and sequence lengths.

Storage of and access to metadata like sequence descriptions is supported. Implemented in C, the GtEncseq provides a variety of methods for random and sequential access to characters and substrings (including different reading directions) using an object-oriented interface. It is also accessible from several scripting languages (Ruby, Python, Lua).

Documentation
A documentation of the GtEncseq C API can be found here.

Example code
The following C (first code) and Python (second code) example code illustrates how the contents of the GtEncseq can be accessed via the API.

#include "genometools.h"

int main(int argc, const char *argv[])
{
GtError *err;
GtEncseq *es;
GtEncseqLoader *el;
GtEncseqReader *er;
unsigned long i,
tmp,
desclen;
const char *desc;
char *buf;

/* initialize GenomeTools library */
gt_lib_init();

/* create GtEncseqLoader */
el = gt_encseq_loader_new();

/* only accept indexes with description support */
gt_encseq_loader_disable_autosupport(el);
gt_encseq_loader_require_description_support(el);

/* load the file */
err = gt_error_new();
es = gt_encseq_loader_load(el, argv[1], err);
gt_encseq_loader_delete(el);

/* error checking */
if (!es) {
printf("%s", gt_error_get(err));
exit(EXIT_FAILURE);
}

/* get number of sequences, e.g. 6 */
tmp = gt_encseq_num_of_sequences(es);

/* get total concatenated length, e.g. 156 */
tmp = gt_encseq_total_length(es);

/* get description */
desc = gt_encseq_description(es, &desclen, 3);

/* extend sequence by their virtual reverse complement */
if (gt_alphabet_is_dna(gt_encseq_alphabet(es))) {
gt_encseq_mirror(es, err);
}

/* get number of sequences, should now be 12 */
tmp = gt_encseq_num_of_sequences(es);

/* get total concatenated length, should now be 313 */
tmp = gt_encseq_total_length(es);

/* remove reverse complement */
if (gt_encseq_is_mirrored(es))
gt_encseq_unmirror(es);

/* get starting position of sequence 3 in the encoded sequence */
tmp = gt_encseq_seqstartpos(es, 3);

/* output sequence 3 as FASTA */
printf(">%.*s\n", (int) desclen, desc);
for (i = 0; i < gt_encseq_seqlength(es, 3); i++) {
putc(gt_encseq_get_decoded_char(es, tmp + i,
GT_READMODE_FORWARD), stdout);
}
printf("\n");

/* alternative: using a reader */
printf(">%.*s\n", (int) desclen, desc);
er = gt_encseq_create_reader_with_readmode(es, GT_READMODE_FORWARD, tmp);
for (i = 0; i < gt_encseq_seqlength(es, 3); i++) {
putc(gt_encseq_reader_next_decoded_char(er), stdout);
}
gt_encseq_reader_delete(er);
printf("\n");

/* alternative: using a buffer (needs extra space) */
printf(">%.*s\n", (int) desclen, desc);
buf = calloc(gt_encseq_seqlength(es, 3) + 1, sizeof (char));
gt_encseq_extract_decoded(es, buf, tmp, tmp + gt_encseq_seqlength(es, 3));
printf("%s\n", buf);
free(buf);

/* cleanup */
gt_error_delete(err);
gt_encseq_delete(es);
gt_lib_clean();
}

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from gt.core.encseq import Encseq, EncseqLoader, EncseqReader
from gt.core.readmode import *
import sys

el = EncseqLoader()
es = el.load(sys.argv[1])

# get number of sequences, e.g. 6
tmp = es.num_of_sequences()

# get total concatenated length, e.g. 156
tmp = es.total_length()

# get description
desc = es.description(3)

# extend sequence by their virtual reverse complement
if es.alphabet().is_dna():
es.mirror()

# get number of sequences, should now be 12
tmp = es.num_of_sequences()

# get total concatenated length, should now be 313
tmp = es.total_length()

# remove reverse complement
if es.is_mirrored():
es.unmirror()

# get starting position of sequence 3 in the encoded sequence
tmp = es.seqstartpos(3)

# output sequence 3 as FASTA
print(">%s" % desc)
for i in range(es.seqlength(3)):
sys.stdout.write(es.get_decoded_char(tmp+i, FORWARD))
print

# alternative: using a reader
print(">%s" % desc)
er = es.create_reader_with_readmode(FORWARD, tmp)
for i in range(es.seqlength(3)):
sys.stdout.write(er.next_decoded_char())
print

# alternative: using a buffer (needs extra space)
print(">%s" % desc)
buf = es.extract_decoded(tmp, tmp + es.seqlength(3))
print(buf)

Availability
The source code is freely available as part of GenomeTools which is released under a BSD-like open source license. For evaluation, please get the current unstable distribution from the GenomeTools web site. The INSTALL file inside the archive contains more information on building a binary version.

The source code of the benchmarking tools used to measure access time in the publication is also available for documentation purposes. Please note that compiling the benchmark tools requires installation of the NCBI C toolkit, the bx-python toolkit, the BLAT software compiled from source and the associated libraries, and/or the SeqAn C++ library.

Publication
S. Steinbiss and S. Kurtz:
A new efficient data structure for storage and retrieval of multiple biosequences.
IEEE/ACM Transactions on Computational Biology and Bioinformatics, 9(2):345–357 (2012)

Contact
Stefan Kurtz
Center for Bioinformatics, University of Hamburg
Bundesstr. 43, 20146 Hamburg, Germany
Phone +49 40 42838 7311, Fax. +49 40 42838 7312