-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathElement.java
More file actions
383 lines (304 loc) · 12.3 KB
/
Element.java
File metadata and controls
383 lines (304 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
package javaxt.html;
import javaxt.xml.DOM;
//******************************************************************************
//**  HTML Element
//******************************************************************************
/**
 *   Used to represent a DOM element in an HTML document. An instance is built
 *   from the element's outer HTML (start tag, content, and optional end tag).
 *   The node name, raw attribute string, and inner HTML are extracted once in
 *   the constructor. Queries for nested elements are delegated to a Parser
 *   that is created lazily over the inner HTML.
 *
 ******************************************************************************/

public class Element {

    private final String nodeName;
    private final String attributes;
    private final String innerHTML;
    private final String outerHTML;
    private final boolean isClosed;

    /** Created on first use by getParser(); parses the inner HTML. */
    private Parser parser;


  //**************************************************************************
  //** Constructor
  //**************************************************************************
  /** @param outerHTML HTML used to define a tag
   *  (e.g. {@code <div id="1">test</div>})
   *  @throws Exception if the html is null or if no usable ">" terminating the
   *  start tag can be found (this includes html comments)
   */
    public Element(String outerHTML) throws Exception {

        if (outerHTML==null) throw new Exception("Invalid or unsupported html");


      //Get position immediately after the first whitespace character
        int ws = -1;
        for (int z=0; z<outerHTML.length(); z++){
            if (isWhitespace(outerHTML.charAt(z))){
                ws = z+1;
                break;
            }
        }


      //Get position of the first ">" character. An index of 0 leaves no room
      //for the leading "<" so it is rejected together with "not found".
        int gtIdx = Parser.findGT(0, outerHTML+" ");
        if (gtIdx<1) throw new Exception("Invalid or unsupported html"); //includes html comments
        int contentStart = gtIdx+1;


      //Set nodeName and attributes
        String name;
        String attrs;
        if (ws==-1 || contentStart<ws){

          //No whitespace inside the start tag, so there are no attributes
            name = outerHTML.substring(1, gtIdx);
            attrs = "";
        }
        else{

          //Everything between the first whitespace and ">" is attribute text.
          //A trailing "/" belongs to a self closing tag, not to an attribute.
            name = outerHTML.substring(1, ws);
            attrs = outerHTML.substring(ws, gtIdx).trim();
            if (attrs.endsWith("/")){
                attrs = attrs.substring(0, attrs.length()-1).trim();
            }
        }
        if (name.endsWith("/")) name = name.substring(0, name.length()-1);
        name = name.trim();


      //Set innerHTML
        String inner = outerHTML.substring(contentStart);
        boolean closed;
        if (inner.trim().length()>0){ //has inner html

          //HTML tag names are case-insensitive, so "</div" must also match a
          //"<DIV" start tag
            int idx = lastIndexOfIgnoreCase(inner, "</" + name);
            if (idx>-1){
                inner = inner.substring(0, idx);
                closed = true;
            }
            else{
                closed = false;
            }
        }
        else{ //no inner html

          //Check if the tag is self enclosing (e.g. "<hr/>") by looking at
          //the character immediately before ">"
            closed = outerHTML.charAt(gtIdx-1)=='/';
        }


        this.outerHTML = outerHTML;
        this.nodeName = name;
        this.attributes = attrs;
        this.innerHTML = inner;
        this.isClosed = closed;
    }


  //**************************************************************************
  //** getName
  //**************************************************************************
  /** Returns the node name associated with the element
   */
    public String getName(){
        return nodeName;
    }


  //**************************************************************************
  //** getAttributes
  //**************************************************************************
  /** Returns the element attributes
   *  @return Anything that appears after the tag/node name inside the start
   *  tag, trimmed and without the trailing "/" of a self closing tag. Returns
   *  an empty string if the start tag has no attributes.
   */
    public String getAttributes(){
        return attributes;
    }


  //**************************************************************************
  //** isClosed
  //**************************************************************************
  /** Returns true if the element has a closing tag or is self closing
   */
    public boolean isClosed(){
        return isClosed;
    }


  //**************************************************************************
  //** getInnerHTML
  //**************************************************************************
  /** Returns the HTML content (inner HTML) of an element as a String.
   */
    public String getInnerHTML(){
        return innerHTML;
    }


  //**************************************************************************
  //** getOuterHTML
  //**************************************************************************
  /** Returns the HTML used to define this element (tag and attributes), as
   *  well as the HTML content (inner HTML). You can use this String to remove
   *  or replace elements from the original HTML document.
   */
    public String getOuterHTML(){
        return outerHTML;
    }


  //**************************************************************************
  //** getInnerText
  //**************************************************************************
  /** Removes all HTML tags and attributes inside this element, leaving the
   *  raw rendered text.
   */
    public String getInnerText(){
        return Parser.stripHTMLTags(innerHTML);
    }


  //**************************************************************************
  //** getAttribute
  //**************************************************************************
  /** Returns the value for a given attribute. If no match is found, returns
   *  an empty string.
   */
    public String getAttribute(String attributeName){
        return _getAttributeValue(attributeName);
    }


  //**************************************************************************
  //** getElementByID
  //**************************************************************************
  /** Returns an HTML Element with a given id. Returns null if the element was
   *  not found.
   */
    public Element getElementByID(String id){
        return getElementByAttributes(null, "id", id);
    }


  //**************************************************************************
  //** getElementsByTagName
  //**************************************************************************
  /** Returns an array of HTML Elements with given tag name.
   */
    public Element[] getElementsByTagName(String tagName){
        return getParser().getElementsByTagName(tagName);
    }


  //**************************************************************************
  //** getElementByTagName
  //**************************************************************************
  /** Returns the first HTML Element with given tag name. Returns null if an
   *  element was not found.
   */
    public Element getElementByTagName(String tagName){
        return getElementByAttributes(tagName, null, null);
    }


  //**************************************************************************
  //** getChildNodes
  //**************************************************************************
  /** Returns the elements that the parser finds in the inner HTML of this
   *  element.
   */
    public Element[] getChildNodes(){
        return getParser().getElements();
    }


  //**************************************************************************
  //** getElements
  //**************************************************************************
  /** Returns an array of HTML Elements with given tag name, attribute, and
   *  attribute value (e.g. "div", "class", "panel-header").
   */
    public Element[] getElements(String tagName, String attributeName, String attributeValue){
        return getParser().getElements(tagName, attributeName, attributeValue);
    }


  //**************************************************************************
  //** getElementByAttributes
  //**************************************************************************
  /** Returns the first HTML Element with given tag name and attribute. Returns
   *  null if an element was not found.
   */
    public Element getElementByAttributes(String tagName, String attributeName, String attributeValue){
        return getParser().getElementByAttributes(tagName, attributeName, attributeValue);
    }


  //**************************************************************************
  //** getImageLinks
  //**************************************************************************
  /** Returns a list of links to images. The links may include relative paths.
   *  Use the Parser.getAbsolutePath() method to resolve the relative paths to
   *  a fully qualified url.
   */
    public String[] getImageLinks(){
        return getParser().getImageLinks();
    }


  //**************************************************************************
  //** getParser
  //**************************************************************************
  /** Returns the parser for the inner HTML, creating it on first use. The
   *  lazy initialization is not synchronized.
   */
    private Parser getParser(){
        if (parser==null) parser = new Parser(innerHTML);
        return parser;
    }


  //**************************************************************************
  //** toString
  //**************************************************************************
  /** Returns the outer HTML of this element. See getOuterHTML() for more
   *  information.
   */
    @Override
    public String toString(){
        return outerHTML;
    }


    /** @deprecated Use getInnerText() */
    @Deprecated
    public String stripHTMLTags(){
        return getInnerText();
    }


    /** @deprecated Use getAttribute() */
    @Deprecated
    public String getAttributeValue(String attributeName){
        return getAttribute(attributeName);
    }


  //**************************************************************************
  //** _getAttributeValue
  //**************************************************************************
  /** Returns the value for a given attribute. If no match is found, returns
   *  an empty string. The start tag is first handed to the XML parser as a
   *  self closing tag. If that fails for any reason (e.g. unquoted or valueless
   *  attributes), the attribute text is scanned by hand instead.
   */
    private String _getAttributeValue(String attributeName){
        String tag = (nodeName + " " + attributes).trim();
        try{
            //NOTE(review): the tag text originates from the parsed document;
            //confirm that DOM.createDocument does not resolve external entities.
            org.w3c.dom.Document xml = DOM.createDocument("<" + tag + "/>");
            org.w3c.dom.NamedNodeMap attr = xml.getFirstChild().getAttributes();
            return DOM.getAttributeValue(attr, attributeName);
        }
        catch(Exception e){
            //Not usable as XML. Fall back to the lenient scanner.
            return _getAttributeValue2(tag, attributeName);
        }
    }


  //**************************************************************************
  //** _getAttributeValue2
  //**************************************************************************
  /** Lenient attribute lookup used when the start tag is not well-formed XML.
   *  Walks the attribute text one attribute at a time so that quoted values are
   *  returned verbatim and "=" or whitespace inside quotes is never mistaken
   *  for a separator.
   *
   *  @param tag Node name followed by the raw attribute text. The first
   *  whitespace-delimited token is treated as the node name and skipped.
   *  @param attributeName Name to look for (case-insensitive).
   *  @return Value of the first matching attribute. Returns an empty string if
   *  there is no match, if the attribute has no value, or if its quoted value
   *  is never terminated.
   */
    private String _getAttributeValue2(String tag, String attributeName){

        tag = tag.trim();
        int len = tag.length();


      //Skip the node name
        int i = 0;
        while (i<len && !isWhitespace(tag.charAt(i))) i++;


        while (i<len){

          //Skip whitespace in front of the attribute name
            while (i<len && isWhitespace(tag.charAt(i))) i++;
            if (i>=len) break;


          //Read the attribute name (ends at "=" or whitespace)
            int nameStart = i;
            while (i<len){
                char ch = tag.charAt(i);
                if (ch=='=' || isWhitespace(ch)) break;
                i++;
            }
            String name = tag.substring(nameStart, i);


          //Whitespace is allowed on either side of "="
            while (i<len && isWhitespace(tag.charAt(i))) i++;


          //Read the attribute value, if any
            String value = "";
            if (i<len && tag.charAt(i)=='='){
                i++;
                while (i<len && isWhitespace(tag.charAt(i))) i++;
                if (i<len){
                    char quote = tag.charAt(i);
                    if (quote=='"' || quote=='\''){
                        int end = tag.indexOf(quote, i+1);
                        if (end==-1) return ""; //unterminated quote
                        value = tag.substring(i+1, end);
                        i = end+1;
                    }
                    else{
                        int valueStart = i;
                        while (i<len && !isWhitespace(tag.charAt(i))) i++;
                        value = tag.substring(valueStart, i);
                    }
                }
            }


            if (name.equalsIgnoreCase(attributeName)) return value;
        }

        return "";
    }


  //**************************************************************************
  //** lastIndexOfIgnoreCase
  //**************************************************************************
  /** Returns the index of the last occurrence of the target within the text,
   *  ignoring case, or -1 if there is no such occurrence.
   */
    private static int lastIndexOfIgnoreCase(String text, String target){
        int n = target.length();
        for (int i=text.length()-n; i>=0; i--){
            if (text.regionMatches(true, i, target, 0, n)) return i;
        }
        return -1;
    }


  //**************************************************************************
  //** isWhitespace
  //**************************************************************************
  /** Returns true if the given char is a white space, tab or return
   */
    private static boolean isWhitespace(char c){
        return c==' ' || c=='\r' || c=='\n' || c=='\t';
    }
}