1.根据DTD文件构造HTML树
2.遍历出网页中所有肉眼能看到的东西
3.找出DIV块文本、找出TABLE块文本
4.根据DTD中的实体定义,将HTML转义符转成原有的字符
下一步计划,完成分词模块
大杂烩
通过2组引擎数组,完成对html网页的粗糙预处理
- final String[] blankFilter = {" \t", " \t", "\n", " \t\r\n", "<", " \t"};
意思:如果已经扫描到空格或者tab,那么后面不要出现空格或tab了
"\n", " \t\r\n" 同理
"<", " \t" 同理
- final String[] tagFilters = {"<!--", "-->", "<script", "</script>", "<style", "</style>", "<!DOCTYPE", ">"};
意思,删除script开头,script结尾的所有内容
- private Reader reader;
- public void setReader(Reader reader){this.reader = new BufferedReader(reader);}
-
- char[] buf = null;
- int bufEnd = 0;
-
- /*
- * 将reader中的内容存入500K的缓存中,如果不够采取加倍策略
- * read all content from reader into buf and update the bufEnd,
- * finally will close the reader */
- boolean readyData() throws IOException, NotSetReaderException
- {
- if(reader == null)
- {
- throw new NotSetReaderException();
- }
-
- /* for most page 500K is enough */
- int MOST_PAGE_SIZE = 500 * 1024;
-
- /* do not bigger than 5M */
- int MAX_PAGE_SIZE = 5 * 1024 * 1024;
-
- buf = new char[MOST_PAGE_SIZE];
-
- try {
- int actualRead;
- while((actualRead = reader.read(buf, bufEnd, MOST_PAGE_SIZE-bufEnd)) != -1)
- {
- /* oh, fill all space, so there are something remain in stream */
- if(actualRead == MOST_PAGE_SIZE – bufEnd)
- {
- /* update the bufEnd */
- bufEnd += actualRead;
-
- /* it’s enough */
- if(bufEnd >= MAX_PAGE_SIZE)
- break;
-
- /* assign bigger array */
- char[] tempBuf = new char[MOST_PAGE_SIZE * 2];
- MOST_PAGE_SIZE = MOST_PAGE_SIZE * 2;
- System.arraycopy(buf, 0, tempBuf, 0, bufEnd);
- buf = tempBuf;
- }
- /* nothing to read */
- else
- {
- /* update the bufEnd */
- bufEnd += actualRead;
- }
- }
-
- } catch (IOException e) {
- System.out.println("God said: something wrong when read the page in initialize");
- throw e;
- } finally {
- reader.close();
- }
-
- /* test for the buff
- FileWriter writer = new FileWriter("11");
- writer.write(buf,0, bufEnd);
- writer.close();
- */
-
- return true;
- }
-
-
- void removeUselessContent()
- {
- int saved = 0, reading = 0;
-
- /* 前者为条件(空格tab),后者为删除对象(空格tab) */
- final String[] blankFilter = {" \t", " \t", "\n", " \t\r\n", "<", " \t"};
-
- /* 准备删除的无用标签,提供首末信息 */
- final String[] tagFilters = {"<!–", "–>", "<script", "</script>", "<style", "</style>", "<!DOCTYPE", ">"};
-
- /* 无回溯高效扫描整个文档 */
- while(true)
- {
- if(reading >= bufEnd)
- break;
-
- /* 一直扫描到没有过滤为止 */
- while(true)
- {
- /* 监控是否有符合的过滤,只有当没有什么可以过滤了,
- * 才退出循环,标志为reading是否有读 */
- int checkFilter = reading;
-
- /* 过滤篇首的所有空格 */
- while(saved == 0 && reading < bufEnd &&
- (buf[reading] == ‘ ‘ || buf[reading] == ‘\t‘ ||
- buf[reading] == ‘\r‘ || buf[reading] == ‘\n‘))
- {
- reading++;
- }
-
- reading = removeUselessBlank(blankFilter, reading, saved);
-
- reading = removeHtmlTag(tagFilters, reading);
-
- /* 如果有reading有遇到过滤的条件,让其继续扫描过滤 */
- if(checkFilter == reading)
- break;
- }
-
- if(reading < bufEnd)
- buf[saved++] = buf[reading++];
-
- }
-
- bufEnd = saved;
-
- ///* test for the buff
- try{
- FileWriter writer = new FileWriter("11");
- writer.write(buf,0, bufEnd);
- writer.close();
- //*/
- }catch(IOException e){}
- }
-
-
- private int removeUselessBlank(String[] filterString, int reading, int saved)
- {
- char[] condition, forbidance;
- for(int i = 0; i < filterString.length;)
- {
- condition = filterString[i++].toCharArray();
- forbidance = filterString[i++].toCharArray();
- while (saved – 1 >= 0 && equalAny(condition, buf[saved - 1])
- && equalAny(forbidance, buf[reading])) {
- reading++;
- }
- }
- return reading;
- }
-
- private boolean equalAny(char[] condition, char aim) {
- boolean bool = false;
- for (char c : condition) {
- bool = bool || (c == aim);
- }
- return bool;
- }
-
- private int removeHtmlTag(String[] filterString, int reading) {
-
- char[] styleBegin, styleEnd;
- for (int i = 0; i < filterString.length;) {
- styleBegin = filterString[i++].toCharArray();
- styleEnd = filterString[i++].toCharArray();
-
- while (reading + styleBegin.length < bufEnd && compareCharArray(buf, reading, styleBegin))
- {
- reading = reading + styleBegin.length;
- while (reading + styleEnd.length < bufEnd && !compareCharArray(buf, reading, styleEnd))
- {
- reading++;
- }
- reading = reading + styleEnd.length;
- }
- }
-
- return reading;
- }
-
- private boolean compareCharArray(char[] src, int from, char[] des) {
- int i = 0;
- while (i < des.length
- && (src[from] == des[i] || (src[from] + 32) == des[i])) {
- from++;
- i++;
- }
- return i == des.length;
- }
大杂烩