# Source code for sknano.io.tokenizers.pdb

# -*- coding: utf-8 -*-
"""
=======================================================
PDB tokenizer class (:mod:`sknano.io.tokenizers.pdb`)
=======================================================

.. currentmodule:: sknano.io.tokenizers.pdb

"""
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
__docformat__ = 'restructuredtext en'

from collections import OrderedDict

from pyparsing import Combine, Empty, Forward, Group, Keyword, Literal, \
    OneOrMore, Optional, Word, White, Suppress, Regex, ZeroOrMore, \
    alphas, alphanums, oneOf, nums, matchOnlyAtCol, restOfLine

from sknano.core import integer, real, asint
from sknano.core.crystallography import Crystal3DLattice
from sknano.core.refdata import element_symbols
from sknano.core.atoms import StructureAtom as Atom

__all__ = ['PDBTokenizer']


# Registry of PDB record types, keyed by record name in registration order.
# Every entry starts out as its own empty dict; ``END`` is registered last and
# maps to ``None``.
records = OrderedDict(
    (record_name, {}) for record_name in (
        'HEADER', 'OBSLTE', 'TITLE', 'SPLIT', 'CAVEAT', 'COMPND', 'SOURCE',
        'KEYWDS', 'EXPDTA', 'NUMMDL', 'MDLTYP', 'AUTHOR', 'REVDAT', 'SPRSDE',
        'JRNL', 'REMARK 0', 'REMARK 1', 'REMARK 2', 'REMARK 3', 'REMARK N',
        'DBREF', 'DBREF1', 'DBREF2', 'SEQADV', 'SEQRES', 'MODRES', 'HET',
        'HETNAM', 'HETSYN', 'FORMUL', 'HELIX', 'SHEET', 'SSBOND', 'LINK',
        'CISPEP', 'SITE', 'CRYST1', 'ORIGX1', 'ORIGX2', 'ORIGX3', 'SCALE1',
        'SCALE2', 'SCALE3', 'MTRIX1', 'MTRIX2', 'MTRIX3', 'MODEL', 'ATOM',
        'ANISOU', 'TER', 'HETATM', 'ENDMDL', 'CONECT', 'MASTER',
    )
)
records['END'] = None

# Module-level aliases; each one is the very dict stored in ``records``.
# (ANISOU, TER, HETATM, ENDMDL and END have no alias.)
header = records['HEADER']
obslte = records['OBSLTE']
title = records['TITLE']
split = records['SPLIT']
caveat = records['CAVEAT']
compnd = records['COMPND']
source = records['SOURCE']
keywds = records['KEYWDS']
expdta = records['EXPDTA']
nummdl = records['NUMMDL']
mdltyp = records['MDLTYP']
author = records['AUTHOR']
revdat = records['REVDAT']
sprsde = records['SPRSDE']
jrnl = records['JRNL']
remark_0 = records['REMARK 0']
remark_1 = records['REMARK 1']
remark_2 = records['REMARK 2']
remark_3 = records['REMARK 3']
remark_N = records['REMARK N']
dbref = records['DBREF']
dbref1 = records['DBREF1']
dbref2 = records['DBREF2']
seqadv = records['SEQADV']
seqres = records['SEQRES']
modres = records['MODRES']
het = records['HET']
hetnam = records['HETNAM']
hetsyn = records['HETSYN']
formul = records['FORMUL']
helix = records['HELIX']
sheet = records['SHEET']
ssbond = records['SSBOND']
link = records['LINK']
cispep = records['CISPEP']
site = records['SITE']
cryst1 = records['CRYST1']
origx1 = records['ORIGX1']
origx2 = records['ORIGX2']
origx3 = records['ORIGX3']
scale1 = records['SCALE1']
scale2 = records['SCALE2']
scale3 = records['SCALE3']
mtrix1 = records['MTRIX1']
mtrix2 = records['MTRIX2']
mtrix3 = records['MTRIX3']
model = records['MODEL']
atom = records['ATOM']
conect = records['CONECT']
master = records['MASTER']

# Output format templates.  TITLE has an empty template; the ATOM template is
# the concatenation of the column groups below, with runs of 3 and 10 blanks
# between some of them.
title['format'] = ''
atom['format'] = ''.join((
    '{:6}{:>5} {:>4}{:1}{:>3} {:1}{:>4}{:1}',
    ' ' * 3,
    '{:>8.3f}{:>8.3f}{:>8.3f}',
    '{:>6.2f}{:>6.2f}',
    ' ' * 10,
    '{:2}{:2}',
))

# Literal tokens shared by the record grammars defined in this module.
# NOTE(review): both alternatives of DASH are the same ASCII hyphen as written
# here; presumably one was meant to be a different dash character -- confirm.
DASH = Literal('-') | Literal('-')
SPACE = Literal(' ')
BACKSLASH = Literal('\\')
COMMA = Literal(',')
COLON = Literal(':')
SEMICOLON = Literal(';')
NULL = Literal('NULL')

# Characters accepted by the generic string fields: alphanumerics, the space
# character and the punctuation listed below (',', ':' and ';' are absent).
pdbcharset = alphanums + " `-=[]'./~!@#$%^&*()_+{}|\"<>?"

# classification = Word(alphanums, alphanums + '/', max=40)

# The delimiter characters that are not part of ``pdbcharset``.
special_chars = ',:;'

# def string_field(exact=0):
#     return Word(pdbcharset, exact=exact)


def string_field_with_special_chars(chars, exact=0):
    """Return a `Word` over ``pdbcharset`` extended with ``chars``.

    Parameters
    ----------
    chars : str
        Extra characters to accept in addition to ``pdbcharset``.
    exact : int, optional
        Forwarded unchanged as the ``exact`` argument of `Word`.

    Returns
    -------
    Word

    """
    allowed = pdbcharset + chars
    return Word(allowed, exact=exact)

# Generic text field: a run of characters drawn from ``pdbcharset``.
string_field = Word(pdbcharset)

# Grouped lists of string fields; the separating comma (``csv_list``) or
# semicolon (``slist``) is dropped from the results.
# csv_list = Group(delimitedList(string_field, delim=','))
csv_list = Group(
    string_field + ZeroOrMore(Suppress(COMMA) + string_field)
)
slist = Group(
    string_field + ZeroOrMore(Suppress(SEMICOLON) + string_field)
)

# Long string: a leading field that may also contain ',', ':' and ';',
# followed by any number of (run of spaces, string field) pairs.
lstring = (
    string_field_with_special_chars(',:;') +
    ZeroOrMore(OneOrMore(White(' ')) + string_field)
)

# slist = Group(delimitedList(string_field, delim=';'))
# lstring = Group(delimitedList(string_field, delim=' '))


def padexpr(expr, padwidth):
    """Return ``expr`` followed by ``nspaces(padwidth)``.

    Parameters
    ----------
    expr : pyparsing expression
    padwidth : int
        Passed to `nspaces` to build the trailing padding.

    """
    trailing_blanks = nspaces(padwidth)
    return expr + trailing_blanks


def nspaces(n):
    """Return a suppressed expression matching exactly ``n`` space characters.

    Parameters
    ----------
    n : int
        Number of spaces to match.  ``0`` yields an expression that matches
        the empty string.

    Returns
    -------
    Suppress
        The matched spaces never appear in the parse results.

    """
    if n == 0:
        # pyparsing only honours ``exact`` when it is > 0, so
        # ``White(' ', exact=0)`` would require one *or more* spaces rather
        # than none.  Match nothing instead, so that a value filling its
        # whole field (zero leading blanks) is accepted.
        return Suppress(Empty())
    return Suppress(White(' ', exact=n))


def character_field(width=None, ci=None, cf=None, padwidth=None):
    """Build a fixed-width character field followed by padding.

    The result tries, in order: an all-blank field of ``width`` spaces, then
    (for ``width == 1``) a single character, or (otherwise) values of
    1..``width`` characters, each preceded by ``nspaces`` of the remaining
    width.  Characters are alphanumerics or spaces.

    Parameters
    ----------
    width : int, optional
        Field width.  When omitted it is computed as ``cf - ci + 1``.
    ci, cf : int, optional
        First and last column of the field; only read when ``width`` is
        `None`.
    padwidth : int, optional
        Width passed to `nspaces` for the trailing padding (default 1).

    Returns
    -------
    pyparsing expression
        The field alternatives followed by the padding.

    """
    total = (cf - ci) + 1 if width is None else width
    pad = 1 if padwidth is None else padwidth
    charset = alphanums + ' '

    field = nspaces(total)
    padding = nspaces(pad)

    if total == 1:
        field |= Word(charset, exact=total)
        return field + padding

    for nchars in range(1, total + 1):
        field |= (nspaces(total - nchars) + Word(charset, exact=nchars))
    return field + padding


def integer_field(width=None, ci=None, cf=None, padwidth=None):
    """Build a fixed-width integer field, optionally followed by padding.

    The result tries, in order: an all-blank field of ``width`` spaces, then
    values of 1..``width`` digits, each preceded by ``nspaces`` of the
    remaining width.  Matched digits are converted with ``asint``.

    Parameters
    ----------
    width : int, optional
        Field width.  When omitted it is computed as ``cf - ci + 1``.
    ci, cf : int, optional
        First and last column of the field; only read when ``width`` is
        `None`.
    padwidth : int, optional
        Width passed to `nspaces` for the trailing padding (default 1).
        With ``0`` no padding expression is appended.

    Returns
    -------
    pyparsing expression

    """
    total = (cf - ci) + 1 if width is None else width
    pad = 1 if padwidth is None else padwidth

    field = nspaces(total)
    padding = nspaces(pad)

    for ndigits in range(1, total + 1):
        field |= (nspaces(total - ndigits) +
                  Word(nums, exact=ndigits).setParseAction(asint))

    return field if pad == 0 else field + padding


def lstring_field(width=None, ci=None, cf=None, padwidth=None):
    """Build a fixed-width string field, optionally followed by padding.

    The result tries, in order: an all-blank field of ``width`` spaces, then
    values of 1..``width`` characters, each preceded by ``nspaces`` of the
    remaining width.  Values are built with
    `string_field_with_special_chars`, so ',', ':' and ';' are accepted.

    Parameters
    ----------
    width : int, optional
        Field width.  When omitted it is computed as ``cf - ci + 1``.
    ci, cf : int, optional
        First and last column of the field; only read when ``width`` is
        `None`.
    padwidth : int, optional
        Width passed to `nspaces` for the trailing padding (default 1).
        With ``0`` no padding expression is appended.

    Returns
    -------
    pyparsing expression

    """
    total = (cf - ci) + 1 if width is None else width
    pad = 1 if padwidth is None else padwidth

    field = nspaces(total)
    padding = nspaces(pad)

    for nchars in range(1, total + 1):
        field |= (nspaces(total - nchars) +
                  string_field_with_special_chars(',:;', exact=nchars))

    return field if pad == 0 else field + padding


def continuation_field(width):
    """Build the continuation-number field of a multi-line record.

    The result tries, in order: ``width`` blanks, then numbers of
    1..``width`` digits, each preceded by ``nspaces`` of the remaining
    width.  Matched digits are converted with ``asint``.

    Parameters
    ----------
    width : int
        Width of the continuation field.

    Returns
    -------
    pyparsing expression

    """
    field = nspaces(width)
    for ndigits in range(1, width + 1):
        field |= (nspaces(width - ndigits) +
                  Word(nums, exact=ndigits).addParseAction(asint))
    return field


def Record(name, width=None, ci=None, cf=None, padwidth=None):
    """Return the expression matching the record-name keyword ``name``.

    Parameters
    ----------
    name : str
        Record name, matched as a `Keyword`.
    width, ci, cf : int, optional
        Accepted so existing call sites keep working; they do not affect
        the returned expression.
    padwidth : int, optional
        Number of suppressed spaces required right after the keyword.
        `None` and ``0`` both mean no padding.

    Returns
    -------
    pyparsing expression
        ``Keyword(name)``, followed by ``nspaces(padwidth)`` when
        ``padwidth`` is non-zero.

    """
    keyword = Keyword(name)
    if padwidth:
        return keyword + nspaces(padwidth)
    return keyword


def new_atom(s, l, t):
    """Parse action: build an `Atom` from a matched ATOM/HETATM record.

    Parameters
    ----------
    s : str
        The string being parsed (unused).
    l : int
        Location of the match in ``s`` (unused).
    t : pyparsing.ParseResults
        Tokens of the match; the results names ``serial``, ``x``, ``y`` and
        ``z`` are read, plus ``element`` and ``name`` when present.

    Returns
    -------
    Atom

    """
    # ``ParseResults`` attribute access returns '' (it never raises
    # AttributeError) for a results name that did not match, so ``getattr``
    # defaults are never used.  Look the names up with ``get`` and fall
    # through on missing/empty values: element -> atom name -> 'X'.
    # NOTE(review): the atom name is not always an element symbol -- confirm
    # that `Atom` handles such values as intended.
    element = t.get('element') or t.get('name') or 'X'
    return Atom(element=element, serial=t.serial, x=t.x, y=t.y, z=t.z)


def new_xtal_lattice(s, l, t):
    """Parse action: build a `Crystal3DLattice` from a matched record.

    Parameters
    ----------
    s : str
        The string being parsed (unused).
    l : int
        Location of the match in ``s`` (unused).
    t : pyparsing.ParseResults
        Tokens of the match; the results names ``a``, ``b``, ``c``,
        ``alpha``, ``beta`` and ``gamma`` are read.

    Returns
    -------
    Crystal3DLattice

    """
    return Crystal3DLattice(a=t.a, b=t.b, c=t.c,
                            alpha=t.alpha, beta=t.beta, gamma=t.gamma)


# Date field: two digits, three letters and two digits joined by dashes,
# combined into a single token.
day = Word(nums, exact=2)
month = Word(alphas, exact=3)
year = Word(nums, exact=2)
date_field = Combine(day + DASH + month + DASH + year)

# Four-character ID code: a digit followed by three alphanumerics.
id_code = Word(nums, alphanums, exact=4)
# "token: value" pairs; a specification list is one or more such pairs with
# an optional trailing semicolon, combined into a single token.
token = Word(alphas, alphanums + '_-() ') + Literal(':') + SPACE
token_value = Word(alphanums, alphanums + '_-() ')
speclist = Combine(OneOrMore(token + token_value) + Optional(SEMICOLON))

# Title-section records.  Each starts with the record keyword plus padding;
# the multi-line records then carry a continuation field before their payload.
header_expr = \
    Record("HEADER", padwidth=4) + string_field + \
    Optional(date_field + id_code)
title_expr = \
    Record("TITLE ", padwidth=2) + continuation_field(2) + string_field
compnd_expr = Record("COMPND", padwidth=1) + continuation_field(3) + speclist
source_expr = Record("SOURCE", padwidth=1) + continuation_field(3) + speclist
keywds_expr = Record("KEYWDS", padwidth=2) + continuation_field(2) + csv_list
expdta_expr = Record("EXPDTA", padwidth=2) + continuation_field(2) + slist
author_expr = Record("AUTHOR", padwidth=2) + continuation_field(2) + csv_list
# REVDAT: a 1-3 digit number, an optional continuation field before the date,
# the ID code, a single digit and any number of long strings.
revdat_expr = Record("REVDAT", padwidth=1) + Word(nums, min=1, max=3) + \
    (continuation_field(2) + date_field | date_field) + \
    id_code + Word(nums, exact=1) + ZeroOrMore(lstring)

#     Optional(date_field("modDate") + Suppress(SPACE) + id_code("modId")) + \
#     Suppress(OneOrMore(SPACE)) + Word(nums, max=1)("modType") + \
#     Suppress(OneOrMore(SPACE)) + \
#     Optional(delimitedList(Word(alphas), delim=' ')) + StringEnd()

# JRNL record: the keyword plus padding, followed by one of the journal
# sub-records.  ``jrnl_records`` is a forward declaration that is filled in
# once all the sub-record expressions have been defined.
jrnl_records = Forward()
jrnl_expr = Record("JRNL", width=6, padwidth=6) + jrnl_records


# Journal sub-records, each introduced by its own keyword.
jrnl_auth_expr = Keyword('AUTH') + continuation_field(2) + csv_list
jrnl_titl_expr = Keyword('TITL') + continuation_field(2) + lstring
jrnl_edit_expr = Keyword('EDIT') + continuation_field(2) + lstring
jrnl_ref_expr = Keyword('REF') + lstring
jrnl_publ_expr = Keyword('PUBL') + continuation_field(2) + lstring
jrnl_refn_expr = Keyword('REFN') + (Keyword("ISSN") | Keyword("ESSN")) + \
    lstring
jrnl_pmid_expr = Keyword('PMID') + integer
jrnl_doi_expr = Keyword('DOI') + lstring

# Fill in the forward declaration; alternatives are tried in the order given.
jrnl_records << \
    (jrnl_auth_expr | jrnl_titl_expr | jrnl_edit_expr | jrnl_ref_expr |
     jrnl_publ_expr | jrnl_refn_expr | jrnl_pmid_expr | jrnl_doi_expr)

# REMARK record: the remark number, then either a journal sub-record (after
# two spaces), a long string (after one space), or nothing.
remark_expr = Record("REMARK", padwidth=1) + integer("remarkNum") + \
    (nspaces(2) + jrnl_records | nspaces(1) + lstring | Empty())

# CRYST1: six reals (a, b, c, alpha, beta, gamma), a grouped 'P ...' symbol
# and a trailing integer.  A match is converted by ``new_xtal_lattice``.
cryst_expr = Record("CRYST1") + \
    real("a") + real("b") + real("c") + \
    real("alpha") + real("beta") + real("gamma") + \
    Group(Literal('P') + integer + Optional(integer + integer)) + integer
cryst_expr.setParseAction(new_xtal_lattice)

# ORIGXn / SCALEn (n = 1, 2, 3): the record name followed by four reals.
origx_expr = Regex(r'ORIGX[123]') + \
    real("On1") + real("On2") + real("On3") + real("Tn")

scale_expr = Regex(r'SCALE[123]') + \
    real("Sn1") + real("Sn2") + real("Sn3") + real("Un")

# MASTER: integer counts; the literal "0" after ``numRemark`` is suppressed.
master_expr = Record("MASTER", padwidth=4) + integer("numRemark") + \
    Suppress(Literal("0")) + integer("numHet") + integer("numHelix") + \
    integer("numSheet") + integer("numTurn") + integer("numSite") + \
    integer("numXform") + integer("numCoord") + integer("numTer") + \
    integer("numConect") + integer("numSeq")
# Records recognised by keyword only; the rest of the line is kept verbatim.
obslte_expr = Keyword("OBSLTE") + restOfLine
split_expr = Keyword("SPLIT ") + restOfLine
caveat_expr = Keyword("CAVEAT") + restOfLine
nummdl_expr = Keyword("NUMMDL") + restOfLine
mdltyp_expr = Keyword("MDLTYP") + restOfLine
sprsde_expr = Keyword("SPRSDE") + restOfLine
# DBREF: ID code, then alternating fixed-width character and integer fields;
# whatever follows is kept as the rest of the line.
dbref_expr = Record("DBREF ") + padexpr(id_code, padwidth=1) + \
    character_field(width=1, padwidth=1) + \
    integer_field(width=4, padwidth=0) + \
    character_field(width=1, padwidth=1) + \
    integer_field(width=4, padwidth=0) + \
    character_field(width=1, padwidth=1) + restOfLine
    # lstring_field(ci=27, cf=32, padwidth=1) + restOfLine

# More records recognised by keyword, with the rest of the line kept verbatim
# (SEQADV additionally parses the ID code first).
seqadv_expr = Record("SEQADV") + id_code + restOfLine
seqres_expr = Record("SEQRES") + restOfLine
modres_expr = Record("MODRES") + restOfLine
het_expr = Record("HET", width=6) + restOfLine
hetnam_expr = Record("HETNAM") + restOfLine
hetsyn_expr = Record("HETSYN") + restOfLine
formul_expr = Record("FORMUL") + restOfLine
helix_expr = Record("HELIX ") + restOfLine
sheet_expr = Record("SHEET ") + restOfLine
ssbond_expr = Record("SSBOND") + restOfLine
link_expr = Record("LINK  ") + restOfLine
cispep_expr = Record("CISPEP") + restOfLine
site_expr = Record("SITE  ") + restOfLine
# MTRIXn (n = 1, 2, 3): serial number, four reals and an optional integer.
mtrix_expr = Regex(r'MTRIX[123]') + integer("serial") + \
    real("Mn1") + real("Mn2") + real("Mn3") + real("Vn") + (integer | Empty())

# MODEL / ENDMDL records.
model_expr = Record("MODEL ", width=6, padwidth=4) + integer("serial")
endmdl_expr = Record("ENDMDL")

# Coordinate-section records.  In the expressions below the one-character
# fields (altLoc, chainID, iCode) are only accepted when they start at a
# fixed column (17, 22 and 27 respectively); otherwise a single suppressed
# space is matched in their place.

# ANISOU: atom identification followed by six integers U11..U23, then an
# optional element symbol and an optional charge.
anisou_expr = Record("ANISOU") + integer("serial") + \
    Word(alphanums, min=1, max=4)("name") + \
    (Word(alphanums, exact=1).addParseAction(matchOnlyAtCol(17))("altLoc") |
     nspaces(1)) + \
    Word(alphanums, min=1, max=3)("resName") + \
    (Word(alphanums, exact=1).addParseAction(matchOnlyAtCol(22))("chainID") |
     nspaces(1)) + \
    Word(nums, min=1, max=4).setParseAction(asint)("resSeq") + \
    (Word(alphas, exact=1).addParseAction(matchOnlyAtCol(27))("iCode") |
     nspaces(1)) + \
    integer("U11") + integer("U22") + integer("U33") + \
    integer("U12") + integer("U13") + integer("U23") + \
    Optional(oneOf(' '.join(element_symbols))("element")) + \
    Optional(Combine(integer + oneOf('+ -'))("charge"))

# TER: serial number, residue name, chain ID (required here), residue
# sequence number and an optional insertion code.
ter_expr = Record("TER", width=6) + integer("serial") + \
    Word(alphanums, min=1, max=3)("resName") + \
    Word(alphanums, exact=1).addParseAction(matchOnlyAtCol(22))("chainID") + \
    Word(nums, min=1, max=4).setParseAction(asint)("resSeq") + \
    Optional(Word(alphas, exact=1)
             .addParseAction(matchOnlyAtCol(27))("iCode") | nspaces(1))

# CONECT: a serial number followed by one or more further integers.
conect_expr = Record("CONECT") + integer("serial") + OneOrMore(integer)
end_expr = Record("END", width=6)

# ATOM: atom identification, x/y/z coordinates, occupancy and temperature
# factor, then an optional element symbol and an optional charge.  A match is
# converted by ``new_atom``.
atom_expr = Record("ATOM", width=6) + integer("serial") + \
    Word(alphanums, min=1, max=4)("name") + \
    (Word(alphanums, exact=1).addParseAction(matchOnlyAtCol(17))("altLoc") |
     nspaces(1)) + \
    Word(alphanums, min=1, max=3)("resName") + \
    (Word(alphanums, exact=1).addParseAction(matchOnlyAtCol(22))("chainID") |
     nspaces(1)) + \
    Word(nums, min=1, max=4).setParseAction(asint)("resSeq") + \
    (Word(alphas, exact=1).addParseAction(matchOnlyAtCol(27))("iCode") |
     nspaces(1)) + \
    real("x") + real("y") + real("z") + \
    real("occupancy") + real("tempFactor") + \
    Optional(oneOf(' '.join(element_symbols))("element")) + \
    Optional(Combine(integer + oneOf('+ -'))("charge"))
atom_expr.setParseAction(new_atom)

# HETATM: same layout as ATOM, except that the residue name and residue
# sequence number may be blank (3 and 4 suppressed spaces respectively).
# A match is converted by ``new_atom``.
hetatm_expr = Record("HETATM") + integer("serial") + \
    Word(alphanums, min=1, max=4)("name") + \
    (Word(alphanums, exact=1).addParseAction(matchOnlyAtCol(17))("altLoc") |
     nspaces(1)) + \
    (Word(alphanums, min=1, max=3)("resName") | nspaces(3)) + \
    (Word(alphanums, exact=1).addParseAction(matchOnlyAtCol(22))("chainID") |
     nspaces(1)) + \
    (Word(nums, min=1, max=4).setParseAction(asint)("resSeq") |
     nspaces(4)) + \
    (Word(alphas, exact=1).addParseAction(matchOnlyAtCol(27))("iCode") |
     nspaces(1)) + \
    real("x") + real("y") + real("z") + \
    real("occupancy") + real("tempFactor") + \
    Optional(oneOf(' '.join(element_symbols))("element")) + \
    Optional(Combine(integer + oneOf('+ -'))("charge"))
hetatm_expr.setParseAction(new_atom)

# Record names grouped into mandatory and optional record types.
mandatory_records = ['HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS',
                     'EXPDTA', 'AUTHOR', 'REVDAT', 'REMARK 2',
                     'REMARK 3', 'SEQRES', 'CRYST1', 'ORIGX1',
                     'ORIGX2', 'ORIGX3', 'SCALE1', 'SCALE2', 'SCALE3',
                     'MASTER', 'END']
optional_records = ['OBSLTE', 'SPLIT', 'CAVEAT', 'NUMMDL', 'MDLTYP',
                    'SPRSDE', 'JRNL', 'REMARK 0', 'REMARK 1',
                    'REMARK N', 'DBREF', 'DBREF1', 'DBREF2',
                    'SEQADV', 'MODRES', 'HET', 'HETNAM', 'HETSYN',
                    'FORMUL', 'HELIX', 'SHEET', 'SSBOND', 'LINK',
                    'CISPEP', 'SITE', 'MTRIX1', 'MTRIX2', 'MTRIX3',
                    'MODEL', 'ATOM', 'ANISOU', 'TER', 'HETATM',
                    'ENDMDL', 'CONECT']

# Expression matching any single record defined above.  The alternatives are
# tried in the order listed and the first one that matches is used.
record_expr = Forward()
record_expr << (header_expr | title_expr | compnd_expr | source_expr |
                keywds_expr | expdta_expr | author_expr | revdat_expr |
                remark_expr | seqres_expr | cryst_expr | origx_expr |
                scale_expr | master_expr | end_expr | obslte_expr |
                split_expr | caveat_expr | nummdl_expr | mdltyp_expr |
                sprsde_expr | jrnl_expr | dbref_expr | seqadv_expr |
                modres_expr | het_expr | hetnam_expr | hetsyn_expr |
                formul_expr | helix_expr | sheet_expr | ssbond_expr |
                link_expr | cispep_expr | site_expr | mtrix_expr |
                model_expr | atom_expr | anisou_expr | ter_expr |
                hetatm_expr | endmdl_expr | conect_expr)


class PDBTokenizer:
    """PDB Record tokenizer.

    Attributes
    ----------
    parse_results : OrderedDict
        Parse result stored per record name; a later record of the same
        name replaces the earlier entry.
    raw_fields : OrderedDict
        Raw input text stored per record name, replaced the same way.

    """

    def __init__(self):
        self.parse_results = OrderedDict()
        self.raw_fields = OrderedDict()

    def _update_records(self, record, fields, parse_result):
        """Store the raw text and parse result under the name ``record``."""
        self.raw_fields[record] = fields
        self.parse_results[record] = parse_result

    def _parse_header(self, fields):
        """Parse a HEADER record with ``header_expr``, store and return it."""
        result = header_expr.parseString(fields)
        self._update_records('HEADER', fields, result)
        return result

    def _parse_title(self, fields):
        """Parse a TITLE record with ``title_expr``, store and return it."""
        result = title_expr.parseString(fields)
        self._update_records('TITLE', fields, result)
        return result

    # The following handlers are placeholders: they accept the record text,
    # do nothing with it and return ``None``.

    def _parse_compnd(self, fields):
        """Placeholder for COMPND records; returns ``None``."""

    def _parse_source(self, fields):
        """Placeholder for SOURCE records; returns ``None``."""

    def _parse_keywds(self, fields):
        """Placeholder for KEYWDS records; returns ``None``."""

    def _parse_expdta(self, fields):
        """Placeholder for EXPDTA records; returns ``None``."""

    def _parse_author(self, fields):
        """Placeholder for AUTHOR records; returns ``None``."""

    def _parse_revdat(self, fields):
        """Placeholder for REVDAT records; returns ``None``."""

    def _parse_jrnl(self, fields):
        """Placeholder for JRNL records; returns ``None``."""

    def _parse_remark(self, fields):
        """Placeholder for REMARK records; returns ``None``."""

    def _parse_dbref(self, fields):
        """Placeholder for DBREF records; returns ``None``."""

    def _parse_seqadv(self, fields):
        """Placeholder for SEQADV records; returns ``None``."""

    def _parse_seqres(self, fields):
        """Placeholder for SEQRES records; returns ``None``."""

    def _parse_formul(self, fields):
        """Placeholder for FORMUL records; returns ``None``."""

    def _parse_helix(self, fields):
        """Placeholder for HELIX records; returns ``None``."""

    def _parse_sheet(self, fields):
        """Placeholder for SHEET records; returns ``None``."""

    def _parse_ssbond(self, fields):
        """Placeholder for SSBOND records; returns ``None``."""

    def _parse_cryst1(self, fields):
        """Placeholder for CRYST1 records; returns ``None``."""

    def _parse_origx1(self, fields):
        """Placeholder for ORIGX1 records; returns ``None``."""

    def _parse_origx2(self, fields):
        """Placeholder for ORIGX2 records; returns ``None``."""

    def _parse_origx3(self, fields):
        """Placeholder for ORIGX3 records; returns ``None``."""

    def _parse_scale1(self, fields):
        """Placeholder for SCALE1 records; returns ``None``."""

    def _parse_scale2(self, fields):
        """Placeholder for SCALE2 records; returns ``None``."""

    def _parse_scale3(self, fields):
        """Placeholder for SCALE3 records; returns ``None``."""

    def _parse_atom(self, fields):
        """Parse a whole ATOM record with ``atom_expr``.

        The entire input must match (``parseAll=True``).  The first token
        of the result is stored under ``'ATOM'`` and returned.
        """
        result = atom_expr.parseString(fields, parseAll=True)[0]
        self._update_records('ATOM', fields, result)
        return result

    def _parse_ter(self, fields):
        """Placeholder for TER records; returns ``None``."""

    def _parse_hetatm(self, fields):
        """Parse a whole HETATM record with ``hetatm_expr``.

        The entire input must match (``parseAll=True``).  The first token
        of the result is stored under ``'HETATM'`` and returned.
        """
        result = hetatm_expr.parseString(fields, parseAll=True)[0]
        self._update_records('HETATM', fields, result)
        return result

    def _parse_conect(self, fields):
        """Placeholder for CONECT records; returns ``None``."""

    def _parse_master(self, fields):
        """Placeholder for MASTER records; returns ``None``."""

    def _parse_end(self, fields):
        """Placeholder for END records; returns ``None``."""