Calcul des sphères d'influence des
atomes d'une protéine
(in-package :cl-user)
(eval-when (compile) (declaim (optimize (speed 3) (safety 1) (space 0) (debug 0))))
;;; Influence spheres of atoms in a protein
;;; Sphères d'influence des atomes d'une protéine
;;; Version 1.0
;;; Author: Francis Leboutte Email: f.leboutte @ algo.be URL: www.algo.be
;;; Donwload: http://www.algo.be/logo1/lisp/prog.html
;;;
;;; Copyright (c) 2004 by Francis Leboutte. Released as free software under the terms
;;; of the GNU General Public License as published by the Free Software Foundation
;;; (GPL version 2 or any later version, see http://www.free-soft.org)
;;;
;;; Documentation (UK, FR)
;;; -------------
;;;
;;; Tree steps :
;;; 1. Parse a Protein Data Bank (PDB) file
;;; see http://www.rcsb.org/pdb/
;;; 2. For each atom in the protein, compute the number of neighbors in
;;; spheres centered on the atom (look for the *influence-radiuses* variable
;;; below).
;;; Atoms in the neighboring, i.e. atoms on the same residue and on the next
;;; residues, can be excluded from the process, following the value in
;;; *rsn-exclusion-distance* variable (RSN, residue sequence number).
;;; 3. Results are written in a CSV file
;;;
;;; Calcul en 3 étapes :
;;; 1. Analyse un fichier de type Protein Data Bank (PDB)
;;; (voir http://www.rcsb.org/pdb/)
;;; 2. Pour chaque atome de la protéine, calcul le nombre d'atomes voisins dans
;;; une sphère de rayon donné centrée sur l'atome, pour différentes valeurs
;;; du rayon (par exemple 8, 16, 32 angstroms), selon un paramètre "rayon
;;; d'influence".
;;; Les atomes voisins, c'est-à-dire situés sur le même acide aminé ou sur
;;; les acides aminés consécutifs, peuvent être exclus du calcul selon un
;;; paramètre dit "distance rsn d'exclusion" (rsn : residue sequence number).
;;; Possibilité de pondérer le décompte selon la distance.
;;; 3.Les résultats sont transcrits dans un fichier csv
;;;
;;; abréviations
;;; ------------
;;; ASN : Atom serial number
;;; RSN : Residue sequence number
;;;
;;; CODE TESTED with Allegro Common Lisp 6.1 ONLY (www.franz.com), should be
;;; ANSI Common Lisp compliant
;;;
#|
How to
------
- compile and load the file
- eval :
(main "G:\\labage\\1AMK.pdb")
ou
(main #p"G:\\labage\\1AMK.pdb")
ou
(main (make-pathname :device "g"
:directory '(:absolute "labage")
:name "1AMK"
:type "pdb"))
Paramètres
----------
*rsn-exclusion-distance* et *influence-radiuses* (voir ci-dessous)
A adapter
---------
pondération, actuellement :
(defun weighted-count (d)
;(/ 2 d)
1)
A METTRE AU POINT ?
-------------------
- répétition d'un atome
Je n'ai pas testé le code avec un fichier PDB qui présente des
répétitions d'atomes :
* If an atom is provided in more than one position, then a non-blank
alternate location indicator must be used as the alternate location
indicator for each of the positions. Within a residue all atoms that are
associated with each other in a given conformation are assigned the same
alternate position indicator.
Actuellement, il y a un test pour détecter ce cas et donné ce message :
atom X is provided in more than one position
L'implantation actuelle ne devrait pas être sensible à une répétition
d'un atome
- traitement de hetatm particulier ?
Notes
-----
- suppose que ASN >= 1 (n'est pas clair dans la documentaton PDB)
- le vecteur des données est adjustable (au prix d'une perte de
perfomance de ± 25%)
OPTIMISATIONS
-------------
Les low level optimisations and compiler switches (voir ci-dessous) font
que le temps de calcul total est divisé par 2.4
Debug
-----
(:dtest
(main #p"G:\\labage\\1AMK.pdb"))
(:dtest
(setf *influence-radiuses* '(32))
(main #p"G:\\labage\\bench.pdb"))
(:dtest
(setf *influence-radiuses* '(32))
(excl:gc t)
(time (process-elements))
))
|#
;;;***************************************************************************************
;;; utilities
(eval-when (:compile-toplevel :load-toplevel :execute)
(defmacro with-gensyms (symbols &body body)
`(let ,(mapcar #'(lambda (symbol)
`(,symbol (gensym)))
symbols)
,@body)))
(defmacro m (&rest arguments)
(with-gensyms (stream)
`(let ((,stream t))
(funcall #'format ,STREAM "~&>>~{ ~:[~S~;~A~]~}"
(mapcan
(lambda (arg) (list (stringp arg) arg))
(list ,@arguments)))
(force-output ,stream)
(values))))
;;; parse-float: reading floats from a string
;;; This optmized parse-float function is based on version from CMU Lisp repository,
;;; please see mkant.copyright for copyright on original version of this function.
(declaim (inline whitespace-p))
(defun whitespace-p (char)
(declare (OPTIMIZE (speed 3) (safety 1)))
(char= char #\space))
(declaim (notinline whitespace-p))
(defun parse-float (string start &optional end (float-type 'double-float))
"Based on parse-float from CMU Lisp repository, by Mark Kantrowitz.
If there's nothing in the string but whitespace return 0. Return 0 too, if there
is nothing but whitespaces and one dot (or a +, a -). End may be NIL (meaning
end of the string)
The decimal character has to be `.'
Digit grouping symbols are not allowed.
Radix is 10 "
(declare (OPTIMIZE (speed 3) (safety 1)))
(declare (INLINE whitespace-p))
(setq end (or end (length string)))
;; Skip over whitespace. If there's nothing but whitespace return 0
(let ((index (or (position-if-not #'whitespace-p string
:start start :end end)
(return-from parse-float (values 0.0 end))))
(minusp nil) (decimalp nil) (found-digit nil)
(before-decimal 0) (after-decimal 0) (decimal-counter 0)
(exponent 0)
(result 0)
(radix 10))
(declare (fixnum index))
;; Take care of optional sign.
(let ((char (char string index)))
(cond ((char= char #\-)
(setq minusp t)
(incf index))
((char= char #\+)
(incf index))))
(loop
until (= index end)
for char = (char string index)
as weight = (digit-char-p char radix)
do
(cond ((and weight (not decimalp))
;; A digit before the decimal point
(setq before-decimal (+ weight (* before-decimal radix))
found-digit t))
((and weight decimalp)
;; A digit after the decimal point
(setq after-decimal (+ weight (* after-decimal radix))
found-digit t)
(incf decimal-counter))
((and (char= char #\.) (not decimalp))
;; The decimal point
(setq decimalp t))
((and (or (char-equal char #\E)
(char-equal char #\D)
(char-equal char #\F)
(char-equal char #\S)
(char-equal char #\L))
(= radix 10))
(multiple-value-bind (num idx)
(parse-integer string :start (1+ index) :end end
:radix radix :junk-allowed NIL)
(setq exponent (or num 0)
index idx)
(when (= index end) (return nil))))
((whitespace-p char)
(when (position-if-not #'whitespace-p string
:start (1+ index) :end end)
(error "There's junk in this string: ~S." string))
(return nil))
(t
(error "There's junk in this string: ~S." string)))
(incf index))
;; Cobble up the resulting number
(setq result (coerce (* (+ before-decimal
(* after-decimal
(coerce (expt radix (- decimal-counter))
float-type)))
(coerce (expt radix exponent)
float-type))
float-type))
;; Return the result
(values
(if found-digit
(if minusp (- result) result)
(coerce 0 float-type))
index)))
(defun gc+ (mode)
#+allegro
(excl:gc mode))
;;;***************************************************************************************
;;; variables and structure
;;; A rsn-distance between 2 atoms of 0 means they are on the same residue. Distance
;;; of 1 means they are on 2 contiguous residues...
;;; If the value in *rsn-exclusion-distance* is 1, atoms on the residue and on
;;; the 2 contiguous are excluded.
;;; See rsn-distance et rsn-distance-exclusion? functions
;;;
;;; La distance rsn entre 2 atomes sous laquelle ils s'excluent du calcul d'influence
;;; Un entier >= 0
;;; Distance rsn =
;;; 0 : veut dire que les 2 atomes sont sur le même acide aminé
;;; 1 : ... sur 2 acides successifs
;;; 2 : ...
;;; voir les fonctions rsn-distance et rsn-distance-exclusion?
(defparameter *rsn-exclusion-distance* 3)
;;; list of radiuses in Angstrom unit
(defparameter *influence-radiuses* '(8 16 32 64))
;;; data about `elements', i.e. ATOM and HETATM
;;; données à propos des « elements » ou atomes (exactement les ATOM and HETATM selon PDB)
;;; chaque element est stocké dans le slot d'indice de son ASN
(defvar *data* nil)
(defvar *elements-nber* 0)
(defvar *greater-ASN* 0)
;;; il semble que la base de numérotation est 1
(defconstant +first-ASN+ 1)
;;; to store data about ATOM and HETATM items
(defstruct element
(ASN 0 :type fixnum)
(name "" :type string)
(residue-name "" :type string)
(RSN 0 :type fixnum)
(x 0.0 :type single-float)
(y 0.0 :type single-float)
(z 0.0 :type single-float)
(neighbors-counts nil :type vector))
(defun coordinates (element)
(values (element-x element)(element-y element)(element-z element)))
;;;***************************************************************************************
(defun main (files)
(format t "~%Initialisation...")
(cond (*data*
(dotimes (i (length *data*))
(setf (aref *data* i) nil))
(gc+ t))
(T
(setf *data*
(make-array 2000 :element-type 'element :adjustable T))))
(setf *elements-nber* 0 *greater-ASN* 0)
(unless (listp files)
(setf files (list files)))
(let ((buffer (make-string 81)))
(format t "~%Lecture...")
(loop for path in files
do
(parse-file path buffer))
(format t "~%Calcul, nombre d'atomes traités :~%")
(process-elements)
(format t "~%Ecriture...")
(write-csv-file (first files))
(values)))
; (:dtest (parse-file #p"G:\\labage\\1AMK.pdb" (make-string 81)))
(defun parse-file (path buffer)
(let ((buffer-length (length buffer))
(greater-ASN 0)
(elements-nber 0)
(l (length *influence-radiuses*))
(current-data-length (first (array-dimensions *data*))))
(with-open-file (stream path :direction :input)
(loop for pos = (read-sequence buffer stream)
while (= pos buffer-length)
finally (progn
(setf *elements-nber* elements-nber
*greater-ASN* greater-ASN)
(format t "~% Nombre d'elements identifiés (atom et hetatm) : ~d" elements-nber)
(format t "~% Plus grand ASN (atom serial number) : ~d" greater-ASN))
do
(let ((atom? (string= "ATOM " buffer :start2 0 :end2 6))
(hetatm? (string= "HETATM" buffer :start2 0 :end2 6))
)
(when (or atom? hetatm?)
(let ((next-ASN (parse-integer buffer :start 6 :end 11)))
(cond ((<= next-ASN greater-ASN)
;; devrait aller de pair avec le message suivant (Alternate location
;; indicator)
(format t "~% L'atome ~d a un ASN plus petit que l'atome précédent (~d)"
next-ASN greater-ASN))
(t (setf greater-ASN next-ASN)
(when (>= next-ASN current-data-length)
(incf current-data-length 500)
(adjust-array *data*
(list current-data-length)
:element-type 'element))))
;; fait plusieurs fois si un atome est présent plusieurs fois ... pas gênant
;(setf (svref *data* next-ASN)
(setf (aref *data* next-ASN)
(make-element
:ASN next-ASN
:name (subseq buffer 12 16)
:residue-name (subseq buffer 17 20)
:RSN (parse-integer buffer :start 22 :end 26)
:x (parse-float buffer 30 38 'single-float)
:y (parse-float buffer 38 46 'single-float)
:z (parse-float buffer 46 54 'single-float)
:neighbors-counts
(make-array l :initial-element 0)))
;; détection Alternate location indicator
(if (char-equal #\space (char buffer 16))
(incf elements-nber)
(format t "~% atom ~d is provided in more than one position" next-ASN))
)))))
(gc+ :tenure)))
;;;***************************************************************************************
;;; square and distance
;;; optimized versions of square and distance
#+never
(defmacro sq (n)
`(* ,n ,n))
#+never
(defun distance (x1 y1 z1 x2 y2 z2)
(sqrt (+ (sq (- x1 x2))
(sq (- y1 y2))
(sq (- z1 z2)))))
(defmacro sq (n &optional (type 'number))
(with-gensyms (v)
`(the ,type
(let ((,v ,n))
(declare (type ,type ,v))
(* ,v ,v)))))
;;; - and + versions optimized for single-float numbers
(defmacro -sf (&rest args)
`(the single-float
(- ,@(loop for x in args collect `(the single-float ,x)))))
(defmacro +sf (&rest args)
`(the single-float
(+ ,@(loop for x in args collect `(the single-float ,x)))))
(defun distance (x1 y1 z1 x2 y2 z2)
(declare (type single-float x1 y1 z1 x2 y2 z2))
(declare (optimize (speed 3) (safety 0) (space 0) (debug 0)))
(the single-float
(sqrt (+sf
(sq (-sf x1 x2) single-float)
(sq (-sf y1 y2) single-float)
(sq (-sf z1 z2) single-float)))))
;;;***************************************************************************************
;;; see *rsn-exclusion-distance*
(defun rsn-distance (element1 element2)
(abs (- (element-rsn element1)(element-rsn element2))))
;;; see *rsn-exclusion-distance*
(defun rsn-distance-exclusion? (element1 element2)
(<= (rsn-distance element1 element2)
*rsn-exclusion-distance*))
(defun weighted-count (d)
(declare (ignore d))
;(/ 2 d)
1)
(defun process-elements ()
(loop for asn from +first-ASN+ to *greater-ASN*
as element = (aref *data* asn)
when element
do
(when (= (mod asn 100) 0)
(format t "~d " asn))
(process-element element asn)))
; (:dtest (process-element (aref *data* 1)))
(defun process-element (element asn)
(multiple-value-bind (x y z)
(coordinates element)
(loop
;; from asn : inutile de calculer la distance de element à element2 et celle de
;; element2 à element...
for a from asn to *greater-ASN*
as element2 = (aref *data* a)
when (and element2
(not (rsn-distance-exclusion? element element2)))
do
(multiple-value-bind (x2 y2 z2)
(coordinates element2)
(loop with distance = (distance x y z x2 y2 z2)
for r in *influence-radiuses*
as i = 0 then (1+ i)
when (< distance r)
do
(let ((weighted-count (weighted-count distance)))
(incf (svref (element-neighbors-counts element) i)
weighted-count)
(incf (svref (element-neighbors-counts element2) i)
weighted-count)))))))
;;;***************************************************************************************
;;; to write csv file
; (make-PDB-csv-pathame #p"G:\\labage\\1AMK.pdb")
;;;> #p"G:\\labage\\1AMK_1.csv"
;;; after file creation :
; (make-PDB-csv-pathame #p"G:\\labage\\1AMK.pdb")
;;;> #p"G:\\labage\\1AMK_2.csv"
;;;
(defun make-PDB-csv-pathame (pdb-pathname)
(let* ((pdb-name (pathname-name pdb-pathname))
(base-name (concatenate 'string
pdb-name
"_"))
(names (mapcar #'pathname-name
(directory (make-pathname :defaults pdb-pathname
:name "*"
:type "csv"))))
(i (loop with max = 0
with base-name-l = (length base-name)
for name in names
when (search base-name name :test #'string-equal)
do
(setf max (max max
(handler-case
(parse-integer name :start base-name-l)
(parse-error () 0))))
finally (return (format nil "~d" (1+ max))))))
(make-pathname :defaults pdb-pathname
:name (concatenate 'string
base-name
i)
:type "csv")))
;;; T : Terminating value
(defmacro write-CSV-T-value (stream value)
(with-gensyms (item)
`(let ((,item ,value))
(if (floatp ,item)
(format stream "~,2F" ,item)
(princ ,item ,stream)))))
;;; NT : Non Terminating value
(defmacro write-CSV-NT-value (stream item)
`(progn
(write-CSV-T-value ,stream ,item)
(princ #\, ,stream)))
;;; no comma after the last value
(defun write-CSV-vector (stream vector)
(loop
with length = (length vector)
with 1-length = (1- length)
for i below 1-length
do
(write-CSV-NT-value stream (svref vector i))
finally
(unless (zerop length)
(write-CSV-T-value stream (svref vector 1-length)))))
;(:dtest (write-csv-file #p"G:\\labage\\1AMK.pdb"))
(defun write-csv-file (pdb-pathname)
(let ((path (make-PDB-csv-pathame pdb-pathname)))
(format t "~% ~A" path)
(with-open-file (stream path
:direction :output
:if-does-not-exist :create)
(format stream "~a"
(with-open-file (stream pdb-pathname :direction :input)
(read-line stream)))
(format stream "~2%Distance (RSN) d'exclusion : ~d" *rsn-exclusion-distance*)
(format stream "~2%,,,,Nbre d'atomes dans sphère" *influence-radiuses*)
(format stream "~%ASN,Name,RSN,R name,~{r=~d~^,~}" *influence-radiuses*)
(loop
for i from +first-ASN+ to *greater-ASN*
as element = (aref *data* i)
when element
do
(terpri stream)
(write-CSV-NT-value stream (element-asn element))
(write-CSV-NT-value stream (element-name element))
(write-CSV-NT-value stream (element-rsn element))
(write-CSV-NT-value stream (element-residue-name element))
(write-CSV-vector stream (element-neighbors-counts element))))
path))