picovcf Documentation

picovcf is a single-header C++ library for fast, low-memory VCF (Variant Call Format) parsing. Gzipped VCF (.vcf.gz) is optionally supported when you link against zlib.

There are a lot of great tools for processing VCF files out there, but not many C++ libraries that are small (only parsing, no extra functionality) and easy to use. picovcf attempts to fill this niche by providing a header-only library using modern C++ (C++11) that allows clients to be selective about which parts of the VCF file get parsed.

There are two parts to the API:

  1. VCF parsing.

  2. Indexable Genotype Data (IGD) file format support, both reading and writing.

Example usage for both is given below.

Simple Usage Examples

VCF Example: Iterate variants

#include <iostream>
#include "picovcf.hpp"

int main(int argc, char *argv[]) {
  if (argc >= 2) {
    picovcf::VCFFile vcf(argv[1]);
    vcf.seekBeforeVariants();
    while (vcf.hasNextVariant()) {
      vcf.nextVariant();
      picovcf::VCFVariantView variant = vcf.currentVariant();
      std::cout << "Variant at position: " << variant.getPosition() << std::endl;
    }
  }
  return 0;
}

VCF Example: Iterate genotypes

#include <iostream>
#include <cassert>
#include "picovcf.hpp"

int main(int argc, char *argv[]) {
  if (argc >= 2) {
    picovcf::VCFFile vcf(argv[1]);
    vcf.seekBeforeVariants();
    while (vcf.hasNextVariant()) {
      vcf.nextVariant();
      picovcf::VCFVariantView variant = vcf.currentVariant();
      assert(vcf.currentVariant().hasGenotypeData());
      picovcf::IndividualIteratorGT iterator = variant.getIndividualIterator();
      while (iterator.hasNext()) {
        picovcf::IndexT allele1 = 0;
        picovcf::IndexT allele2 = 0;
        bool isPhased = iterator.getAlleles(allele1, allele2);
        std::cout << (isPhased ? "phased" : "unphased") << " alleles: " << allele1;
        if (allele2 != picovcf::NOT_DIPLOID) {
          std::cout << ", " << allele2;
        }
        std::cout << std::endl;
      }
    }
  }
  return 0;
}

IGD Example: Iterate variants

#include <iostream>
#include "picovcf.hpp"

int main(int argc, char *argv[]) {
  if (argc >= 2) {
    picovcf::IGDData igd(argv[1]);
    for (size_t i = 0; i < igd.numVariants(); i++) {
      std::cout << "Variant at position: " << igd.getPosition(i) << std::endl;
    }
  }
  return 0;
}

IGD Example: Iterate genotypes

#include <iostream>
#include "picovcf.hpp"

int main(int argc, char *argv[]) {
  if (argc >= 2) {
    picovcf::IGDData igd(argv[1]);
    for (size_t i = 0; i < igd.numVariants(); i++) {
      std::cout << "Samples with alternate allele \"" << igd.getAltAllele(i) << "\": ";
      auto sampleList = igd.getSamplesWithAlt(i);
      for (picovcf::IndexT sampleIndex : sampleList) {
        std::cout << sampleIndex << ", ";
      }
      std::cout << std::endl;
    }
  }
  return 0;
}