X Tutup
Skip to content

Commit 615c03d

Browse files
authored
Merge pull request #713 from noureddine-mb/emblParser
Add an EMBL file parser to BioJava #621
2 parents f80f8bd + 40ddf40 commit 615c03d

File tree

6 files changed

+974
-0
lines changed

6 files changed

+974
-0
lines changed
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
/*
2+
* BioJava development code
3+
*
4+
* This code may be freely distributed and modified under the
5+
* terms of the GNU Lesser General Public Licence. This should
6+
* be distributed with the code. If you do not have a copy,
7+
* see:
8+
*
9+
* http://www.gnu.org/copyleft/lesser.html
10+
*
11+
* Copyright for this code is held jointly by the individual
12+
* authors. These should be listed in @author doc comments.
13+
*
14+
* For more information on the BioJava project and its aims,
15+
* or to join the biojava-l mailing list, visit the home page
16+
* at:
17+
*
18+
* http://www.biojava.org/
19+
*
20+
*/
21+
package org.biojava.nbio.core.sequence.io.embl;
22+
23+
/**
 * Holds the values parsed from the ID line of an EMBL file:
 * <ul>
 * <li>Primary accession number</li>
 * <li>Sequence version number</li>
 * <li>Topology: 'circular' or 'linear'</li>
 * <li>Molecule type</li>
 * <li>Data class</li>
 * <li>Taxonomic division</li>
 * <li>Sequence length</li>
 * </ul>
 * Every value is kept as the string handed to the constructor; no parsing or
 * validation is performed here. Instances cannot be modified after
 * construction: all fields are final and the class exposes no setters.
 *
 * @author Noor Aldeen Al Mbaidin
 * @since 5.0.0
 */
public class EmblId {

    private final String primaryAccession;
    private final String sequenceVersion;
    private final String topology;
    private final String moleculeType;
    private final String dataClass;
    private final String taxonomicDivision;
    private final String sequenceLength;

    /**
     * Creates an ID holder from the individual ID line fields.
     *
     * @param primaryAccession  the primary accession number
     * @param sequenceVersion   the sequence version number
     * @param topology          the topology, 'circular' or 'linear'
     * @param moleculeType      the molecule type
     * @param dataClass         the data class
     * @param taxonomicDivision the taxonomic division
     * @param sequenceLength    the sequence length
     */
    public EmblId(String primaryAccession, String sequenceVersion, String topology,
                  String moleculeType, String dataClass, String taxonomicDivision,
                  String sequenceLength) {
        this.primaryAccession = primaryAccession;
        this.sequenceVersion = sequenceVersion;
        this.topology = topology;
        this.moleculeType = moleculeType;
        this.dataClass = dataClass;
        this.taxonomicDivision = taxonomicDivision;
        this.sequenceLength = sequenceLength;
    }

    /**
     * Returns the primary accession number.
     *
     * @return the primary accession number as passed to the constructor
     */
    public String getPrimaryAccession() {
        return primaryAccession;
    }

    /**
     * Returns the sequence version.
     *
     * @return the sequence version as passed to the constructor
     */
    public String getSequenceVersion() {
        return sequenceVersion;
    }

    /**
     * Returns the topology.
     *
     * @return the topology as passed to the constructor
     */
    public String getTopology() {
        return topology;
    }

    /**
     * Returns the molecule type, i.e. the type of molecule as stored.
     *
     * @return the molecule type as passed to the constructor
     */
    public String getMoleculeType() {
        return moleculeType;
    }

    /**
     * Returns the data class.
     *
     * @return the data class as passed to the constructor
     */
    public String getDataClass() {
        return dataClass;
    }

    /**
     * Returns the taxonomic division.
     *
     * @return the taxonomic division as passed to the constructor
     */
    public String getTaxonomicDivision() {
        return taxonomicDivision;
    }

    /**
     * Returns the sequence length. The last item on the ID line is the length
     * of the sequence (the total number of bases in the sequence). This number
     * includes base positions reported as present but undetermined (coded as "N").
     *
     * @return the sequence length as passed to the constructor
     */
    public String getSequenceLength() {
        return sequenceLength;
    }

}
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
/*
2+
* BioJava development code
3+
*
4+
* This code may be freely distributed and modified under the
5+
* terms of the GNU Lesser General Public Licence. This should
6+
* be distributed with the code. If you do not have a copy,
7+
* see:
8+
*
9+
* http://www.gnu.org/copyleft/lesser.html
10+
*
11+
* Copyright for this code is held jointly by the individual
12+
* authors. These should be listed in @author doc comments.
13+
*
14+
* For more information on the BioJava project and its aims,
15+
* or to join the biojava-l mailing list, visit the home page
16+
* at:
17+
*
18+
* http://www.biojava.org/
19+
*
20+
*/
21+
package org.biojava.nbio.core.sequence.io.embl;
22+
23+
24+
import java.io.*;
25+
import java.util.Arrays;
26+
import java.util.LinkedList;
27+
28+
29+
/**
30+
* This class should process the data of embl file
31+
*
32+
* @author Noor Aldeen Al Mbaidin
33+
* @since 5.0.0
34+
*/
35+
public class EmblReader {
36+
37+
/**
38+
* The parsing is done in this method.<br>
39+
* This method tries to process all the Embl records
40+
* in the File , closes the underlying resource,
41+
* and return the results in object of EmblRecord.<br>
42+
*
43+
* @return EmblRecord containing all the parsed Embl records
44+
* @throws IOException
45+
*/
46+
public static EmblRecord process(File file) throws IOException {
47+
48+
EmblRecord emblRecord = new EmblRecord();
49+
StringBuilder sequence = new StringBuilder("");
50+
LinkedList<EmblReference> emblReferences = new LinkedList<>();
51+
EmblReference emblReference = new EmblReference();
52+
LinkedList<String> accessionNumber = new LinkedList<>();
53+
LinkedList<String> keyword = new LinkedList<>();
54+
55+
if (file == null)
56+
throw new NullPointerException("file can't be null");
57+
58+
if (file.isDirectory())
59+
throw new IllegalArgumentException("the file can't be a directory");
60+
61+
try (FileReader fileReader = new FileReader(file)) {
62+
String line = "";
63+
String lineIdentifier;
64+
String lineInfo;
65+
try (BufferedReader bufferedReader = new BufferedReader(fileReader)) {
66+
while ((line = bufferedReader.readLine()) != null) {
67+
if (line.length() > 1) {
68+
lineInfo = line.substring(2, line.length()).trim();
69+
lineIdentifier = line.substring(0, 2);
70+
if (lineIdentifier.equals("ID"))
71+
emblRecord.setEmblId(populateID(lineInfo));
72+
else if (lineIdentifier.equals("AC"))
73+
populateAccessionNumber(line, accessionNumber);
74+
else if (lineIdentifier.equals("DT") && line.contains("Created"))
75+
emblRecord.setCreatedDate(lineInfo);
76+
else if (lineIdentifier.equals("DT") && line.contains("updated"))
77+
emblRecord.setLastUpdatedDate(lineInfo);
78+
else if (lineIdentifier.equals("DE"))
79+
emblRecord.setSequenceDescription(lineInfo);
80+
else if (lineIdentifier.equals("KW"))
81+
keyword.add(lineInfo);
82+
else if (lineIdentifier.equals("OS"))
83+
emblRecord.setOrganismSpecies(lineInfo);
84+
else if (lineIdentifier.equals("OC"))
85+
emblRecord.setOrganismClassification(lineInfo);
86+
else if (lineIdentifier.equals("OG"))
87+
emblRecord.setOrGanelle(lineInfo);
88+
else if (lineIdentifier.equals("RN") || lineIdentifier.equals("RP")
89+
|| lineIdentifier.equals("RX") || lineIdentifier.equals("RG")
90+
|| lineIdentifier.equals("RA") || lineIdentifier.equals("RT")
91+
|| lineIdentifier.equals("RL"))
92+
populateEmblReferences(lineIdentifier, lineInfo, emblReference, emblReferences);
93+
else if (lineIdentifier.equals("DR"))
94+
emblRecord.setDatabaseCrossReference(lineInfo);
95+
else if (lineIdentifier.equals("AH"))
96+
emblRecord.setAssemblyHeader(lineInfo);
97+
else if (lineIdentifier.equals("AS"))
98+
emblRecord.setAssemblyInformation(lineInfo);
99+
else if (lineIdentifier.equals("CO"))
100+
emblRecord.setConstructedSequence(lineInfo);
101+
else if (lineIdentifier.equals("FH"))
102+
emblRecord.setFeatureHeader(lineInfo);
103+
else if (lineIdentifier.equals("FT"))
104+
emblRecord.setFeatureTable(lineInfo);
105+
else if (lineIdentifier.equals("SQ"))
106+
emblRecord.setSequenceHeader(lineInfo);
107+
else if (lineIdentifier.equals(" ") && !lineIdentifier.equals("//"))
108+
populateSequence(line, sequence);
109+
else if (lineIdentifier.equals("//")) {
110+
emblRecord.setKeyword(keyword);
111+
emblRecord.setEmblReference(emblReferences);
112+
emblRecord.setAccessionNumber(accessionNumber);
113+
emblRecord.setSequence(sequence.toString());
114+
}
115+
116+
}
117+
}
118+
}
119+
}
120+
121+
return emblRecord;
122+
}
123+
124+
private static void populateSequence(String line, StringBuilder sequence) {
125+
String sequenceLine = line.replace(" ", "").
126+
replaceAll("[0-9]", "");
127+
sequence.append(sequenceLine);
128+
}
129+
130+
private static void populateEmblReferences(String lineIdentifier, String lineInfo, EmblReference emblReference
131+
, LinkedList<EmblReference> emblReferences) {
132+
if (lineIdentifier.equals("RN"))
133+
emblReference.setReferenceNumber(lineInfo);
134+
else if (lineIdentifier.equals("RP"))
135+
emblReference.setReferencePosition(lineInfo);
136+
else if (lineIdentifier.equals("RX"))
137+
emblReference.setReferenceCrossReference(lineInfo);
138+
else if (lineIdentifier.equals("RG"))
139+
emblReference.setReferenceGroup(lineInfo);
140+
else if (lineIdentifier.equals("RA"))
141+
emblReference.setReferenceAuthor(lineInfo);
142+
else if (lineIdentifier.equals("RT"))
143+
emblReference.setReferenceTitle(lineInfo);
144+
else if (lineIdentifier.equals("RL")) {
145+
emblReference.setReferenceLocation(lineInfo);
146+
emblReferences.add(emblReference.copyEmblReference(emblReference));
147+
}
148+
}
149+
150+
private static void populateAccessionNumber(String line, LinkedList<String> accessionNumber) {
151+
accessionNumber.add(line);
152+
}
153+
154+
private static EmblId populateID(String line) {
155+
String[] strings = line.split(";");
156+
Arrays.stream(strings).map(String::trim).toArray(unused -> strings);
157+
EmblId emblId = new EmblId(strings[0], strings[1], strings[2]
158+
, strings[3], strings[4], strings[5], strings[6]);
159+
return emblId;
160+
}
161+
162+
163+
}

0 commit comments

Comments
 (0)
X Tutup