|
| 1 | +/* |
| 2 | + * BioJava development code |
| 3 | + * |
| 4 | + * This code may be freely distributed and modified under the |
| 5 | + * terms of the GNU Lesser General Public Licence. This should |
| 6 | + * be distributed with the code. If you do not have a copy, |
| 7 | + * see: |
| 8 | + * |
| 9 | + * http://www.gnu.org/copyleft/lesser.html |
| 10 | + * |
| 11 | + * Copyright for this code is held jointly by the individual |
| 12 | + * authors. These should be listed in @author doc comments. |
| 13 | + * |
| 14 | + * For more information on the BioJava project and its aims, |
| 15 | + * or to join the biojava-l mailing list, visit the home page |
| 16 | + * at: |
| 17 | + * |
| 18 | + * http://www.biojava.org/ |
| 19 | + * |
| 20 | + */ |
| 21 | +package org.biojava.nbio.core.sequence.io.embl; |
| 22 | + |
| 23 | + |
| 24 | +import java.io.*; |
| 25 | +import java.util.Arrays; |
| 26 | +import java.util.LinkedList; |
| 27 | + |
| 28 | + |
| 29 | +/** |
| 30 | + * This class should process the data of embl file |
| 31 | + * |
| 32 | + * @author Noor Aldeen Al Mbaidin |
| 33 | + * @since 5.0.0 |
| 34 | + */ |
| 35 | +public class EmblReader { |
| 36 | + |
| 37 | + /** |
| 38 | + * The parsing is done in this method.<br> |
| 39 | + * This method tries to process all the Embl records |
| 40 | + * in the File , closes the underlying resource, |
| 41 | + * and return the results in object of EmblRecord.<br> |
| 42 | + * |
| 43 | + * @return EmblRecord containing all the parsed Embl records |
| 44 | + * @throws IOException |
| 45 | + */ |
| 46 | + public static EmblRecord process(File file) throws IOException { |
| 47 | + |
| 48 | + EmblRecord emblRecord = new EmblRecord(); |
| 49 | + StringBuilder sequence = new StringBuilder(""); |
| 50 | + LinkedList<EmblReference> emblReferences = new LinkedList<>(); |
| 51 | + EmblReference emblReference = new EmblReference(); |
| 52 | + LinkedList<String> accessionNumber = new LinkedList<>(); |
| 53 | + LinkedList<String> keyword = new LinkedList<>(); |
| 54 | + |
| 55 | + if (file == null) |
| 56 | + throw new NullPointerException("file can't be null"); |
| 57 | + |
| 58 | + if (file.isDirectory()) |
| 59 | + throw new IllegalArgumentException("the file can't be a directory"); |
| 60 | + |
| 61 | + try (FileReader fileReader = new FileReader(file)) { |
| 62 | + String line = ""; |
| 63 | + String lineIdentifier; |
| 64 | + String lineInfo; |
| 65 | + try (BufferedReader bufferedReader = new BufferedReader(fileReader)) { |
| 66 | + while ((line = bufferedReader.readLine()) != null) { |
| 67 | + if (line.length() > 1) { |
| 68 | + lineInfo = line.substring(2, line.length()).trim(); |
| 69 | + lineIdentifier = line.substring(0, 2); |
| 70 | + if (lineIdentifier.equals("ID")) |
| 71 | + emblRecord.setEmblId(populateID(lineInfo)); |
| 72 | + else if (lineIdentifier.equals("AC")) |
| 73 | + populateAccessionNumber(line, accessionNumber); |
| 74 | + else if (lineIdentifier.equals("DT") && line.contains("Created")) |
| 75 | + emblRecord.setCreatedDate(lineInfo); |
| 76 | + else if (lineIdentifier.equals("DT") && line.contains("updated")) |
| 77 | + emblRecord.setLastUpdatedDate(lineInfo); |
| 78 | + else if (lineIdentifier.equals("DE")) |
| 79 | + emblRecord.setSequenceDescription(lineInfo); |
| 80 | + else if (lineIdentifier.equals("KW")) |
| 81 | + keyword.add(lineInfo); |
| 82 | + else if (lineIdentifier.equals("OS")) |
| 83 | + emblRecord.setOrganismSpecies(lineInfo); |
| 84 | + else if (lineIdentifier.equals("OC")) |
| 85 | + emblRecord.setOrganismClassification(lineInfo); |
| 86 | + else if (lineIdentifier.equals("OG")) |
| 87 | + emblRecord.setOrGanelle(lineInfo); |
| 88 | + else if (lineIdentifier.equals("RN") || lineIdentifier.equals("RP") |
| 89 | + || lineIdentifier.equals("RX") || lineIdentifier.equals("RG") |
| 90 | + || lineIdentifier.equals("RA") || lineIdentifier.equals("RT") |
| 91 | + || lineIdentifier.equals("RL")) |
| 92 | + populateEmblReferences(lineIdentifier, lineInfo, emblReference, emblReferences); |
| 93 | + else if (lineIdentifier.equals("DR")) |
| 94 | + emblRecord.setDatabaseCrossReference(lineInfo); |
| 95 | + else if (lineIdentifier.equals("AH")) |
| 96 | + emblRecord.setAssemblyHeader(lineInfo); |
| 97 | + else if (lineIdentifier.equals("AS")) |
| 98 | + emblRecord.setAssemblyInformation(lineInfo); |
| 99 | + else if (lineIdentifier.equals("CO")) |
| 100 | + emblRecord.setConstructedSequence(lineInfo); |
| 101 | + else if (lineIdentifier.equals("FH")) |
| 102 | + emblRecord.setFeatureHeader(lineInfo); |
| 103 | + else if (lineIdentifier.equals("FT")) |
| 104 | + emblRecord.setFeatureTable(lineInfo); |
| 105 | + else if (lineIdentifier.equals("SQ")) |
| 106 | + emblRecord.setSequenceHeader(lineInfo); |
| 107 | + else if (lineIdentifier.equals(" ") && !lineIdentifier.equals("//")) |
| 108 | + populateSequence(line, sequence); |
| 109 | + else if (lineIdentifier.equals("//")) { |
| 110 | + emblRecord.setKeyword(keyword); |
| 111 | + emblRecord.setEmblReference(emblReferences); |
| 112 | + emblRecord.setAccessionNumber(accessionNumber); |
| 113 | + emblRecord.setSequence(sequence.toString()); |
| 114 | + } |
| 115 | + |
| 116 | + } |
| 117 | + } |
| 118 | + } |
| 119 | + } |
| 120 | + |
| 121 | + return emblRecord; |
| 122 | + } |
| 123 | + |
| 124 | + private static void populateSequence(String line, StringBuilder sequence) { |
| 125 | + String sequenceLine = line.replace(" ", ""). |
| 126 | + replaceAll("[0-9]", ""); |
| 127 | + sequence.append(sequenceLine); |
| 128 | + } |
| 129 | + |
| 130 | + private static void populateEmblReferences(String lineIdentifier, String lineInfo, EmblReference emblReference |
| 131 | + , LinkedList<EmblReference> emblReferences) { |
| 132 | + if (lineIdentifier.equals("RN")) |
| 133 | + emblReference.setReferenceNumber(lineInfo); |
| 134 | + else if (lineIdentifier.equals("RP")) |
| 135 | + emblReference.setReferencePosition(lineInfo); |
| 136 | + else if (lineIdentifier.equals("RX")) |
| 137 | + emblReference.setReferenceCrossReference(lineInfo); |
| 138 | + else if (lineIdentifier.equals("RG")) |
| 139 | + emblReference.setReferenceGroup(lineInfo); |
| 140 | + else if (lineIdentifier.equals("RA")) |
| 141 | + emblReference.setReferenceAuthor(lineInfo); |
| 142 | + else if (lineIdentifier.equals("RT")) |
| 143 | + emblReference.setReferenceTitle(lineInfo); |
| 144 | + else if (lineIdentifier.equals("RL")) { |
| 145 | + emblReference.setReferenceLocation(lineInfo); |
| 146 | + emblReferences.add(emblReference.copyEmblReference(emblReference)); |
| 147 | + } |
| 148 | + } |
| 149 | + |
| 150 | + private static void populateAccessionNumber(String line, LinkedList<String> accessionNumber) { |
| 151 | + accessionNumber.add(line); |
| 152 | + } |
| 153 | + |
| 154 | + private static EmblId populateID(String line) { |
| 155 | + String[] strings = line.split(";"); |
| 156 | + Arrays.stream(strings).map(String::trim).toArray(unused -> strings); |
| 157 | + EmblId emblId = new EmblId(strings[0], strings[1], strings[2] |
| 158 | + , strings[3], strings[4], strings[5], strings[6]); |
| 159 | + return emblId; |
| 160 | + } |
| 161 | + |
| 162 | + |
| 163 | +} |
0 commit comments