OCaml Hackers

(* TODO *)

This is an example of parsing structured text entirely with ocamllex, without a separate yacc grammar. The language (GenBank format) is not yacc-friendly because tokens can't be extracted in a single pass independently of the grammar rules.

Input data:

LOCUS       AB000684                 275 bp    DNA     linear   ENV 29-JUL-2005
DEFINITION  Uncultured bacterium gene for 16S rRNA, partial sequence, clone: 2
of cluster I.
ACCESSION AB000684
VERSION AB000684.1 GI:1817504
KEYWORDS ENV.
SOURCE uncultured bacterium
ORGANISM uncultured bacterium
Bacteria; environmental samples.
REFERENCE 1
AUTHORS Inagaki,F., Hayashi,S., Doi,K., Motomura,Y., Izawa,E. and Ogata,S.
TITLE Microbial participation in the formation of siliceous deposits from
geothermal water and analysis of the extremely thermophilic
bacterial community
JOURNAL FEMS Microbiol. Ecol. 24, 41-48 (1997)
REFERENCE 2 (bases 1 to 275)
AUTHORS Inagaki,F., Hayashi,S., Doi,K., Motomura,Y., Izawa,E. and Ogata,S.
TITLE Direct Submission
JOURNAL Submitted (24-JAN-1997) Fumio Inagaki, Faculty of Agriculture,
Kyushu University, Microbial Genetic Division, Institute of Genetic
Resourses; Higashi-ku Hakozaki 6-10-1, Fukuoka-shi, Fukuoka 812-81,
Japan (E-mail:inagaki@agr.kyushu-u.ac.jp, Tel:+81-92-642-3059,
Fax:+81-92-642-3059)
FEATURES Location/Qualifiers
source 1..275
/organism="uncultured bacterium"
/mol_type="genomic DNA"
/db_xref="taxon:77133"
/clone="2 of cluster I"
/environmental_sample
rRNA <1..>275
/product="16S ribosomal RNA"
ORIGIN
1 ctgcccttag ttgccactct tcggagggca ctctaagggg accgccggcg ataagccgag
61 gaaggtgggg atgacgtcag gtcagtatgc cctttatgcc cggggctaca caggcgctac
121 agtggccagg acaatgggaa gcgacccagt aatggggagc aaatccctaa acctggtcat
181 ggtgcagatt gagggctgaa actcgcccct catgaagccg gaatcggtag taatggcgga
241 tcagctaagc cgccgtgaat acgttctcgg gcctt
//


Template for the corresponding ocamllex parser (file genbank.mll):

{  type record = {
    (* One entry of the input file. The fields are mutable so that the
       lexer actions can fill them in one at a time, as the corresponding
       lines are read. The types and fields written as "..." are left out
       of this template. *)
    mutable locus : ...;
(* None until a DEFINITION field has been read, then the whole text of that
   field as a single string. *)
mutable definition : string option;
mutable accession : ...;
...
}

(* Build a fresh record in which every field shown here is None, meaning
   that nothing has been read yet. The record has mutable fields, so each
   entry must get its own record: scan calls this function once per entry. *)
let new_record () = {
locus = None;
definition = None;
accession = None;
...
}

(* Called by the lexer actions each time a pattern that ends with a line
   terminator has been matched. The body is left out of this template. *)
let newline lexbuf =
...
(* would set the correct line count
for useful error messages.
NOTE(review): Lexing.new_line from the standard library performs this kind
of position update - confirm that it fits before using it here. *)
}

(* [top record lexbuf] reads the fields of one entry and stores each of them
   in [record]. It returns [Some record] once the line made of two slashes
   that terminates an entry has been read, and [None] at end of input.

   All the actions of an ocamllex rule must have the same type. Every action
   that handles a field therefore has to finish by calling
   [top record lexbuf] again: this gives it the same option type as the
   terminating actions and keeps reading the remaining fields of the entry.
   The actions written as "{ ... }" are left out of this template and
   presumably need to end the same way. *)
rule top record = parse
    "LOCUS " ...
      { ... }
  | "DEFINITION " ([^'\r' '\n']+ as text) '\r'? '\n'
      {
        newline lexbuf;
        (* The field may span several lines: collect the continuation lines
           that follow the first one. *)
        let def_text = continue_definition [text] lexbuf in
        if record.definition <> None then
          ... (* error: multiple DEFINITION fields *);
        record.definition <- Some def_text;
        (* Go on with the next field of the same entry. Without this call
           the action would have type unit and the rule would not
           typecheck against the actions that return an option. *)
        top record lexbuf
      }
  | "ACCESSION " ...
      { ... }
  ...
  | "//" '\r'? '\n'
      {
        (* End of the entry: hand the filled-in record back to the caller. *)
        newline lexbuf;
        Some record
      }
  | eof
      {
        ... ; (* check that the current record is empty *)
        None
      }
  | ""
      (* Fallback: the empty pattern only wins when no other case matches,
         i.e. when the input starts with something unexpected.
         NOTE(review): the elided error report has to raise an exception
         (or otherwise produce an option) to agree with the other actions. *)
      { (* report error *) }


(* [continue_definition accu lexbuf] reads the continuation lines of a
   multi-line field. [accu] holds the lines read so far, most recent first.
   A continuation line is a line that starts with a space; its text is
   pushed onto [accu] and the rule calls itself again. When the input does
   not start with such a line, the empty pattern matches without consuming
   anything and the accumulated lines are returned in reading order, joined
   with single spaces. *)
and continue_definition accu = parse
    " " ([^'\r' '\n']+ as more) '\r'? '\n'
      {
        newline lexbuf;
        let accu = more :: accu in
        continue_definition accu lexbuf
      }
  | ""
      {
        let lines_in_order = List.rev accu in
        String.concat " " lines_in_order
      }


{
(* [scan process_record lexbuf] reads entries from [lexbuf] one after the
   other. Each entry is parsed into a fresh record by [top] and passed to
   [process_record]; the loop stops as soon as [top] returns [None]. *)
let rec scan process_record lexbuf =
  let parsed = top (new_record ()) lexbuf in
  match parsed with
  | Some entry ->
      process_record entry;
      (* Tail call: continue with the next entry. *)
      scan process_record lexbuf
  | None -> ()
}

This post is directly derived from a post on ocaml-beginners: http://tech.groups.yahoo.com/group/ocaml_beginners/message/10880

Share 

Add a Comment

You need to be a member of OCaml Hackers to add comments!

Join this social network

About

Martin Jambon Martin Jambon created this social network on Ning.

Create your own social network!

Badge

Loading…

© 2009   Created by Martin Jambon on Ning.   Create Your Own Social Network

Badges  |  Report an Issue  |  Privacy  |  Terms of Service