JTidy转换html到xml

沈英勋

2023-12-01

JTidy转换html到xml

方法一：目前无法解决乱码问题

package spide;
import java.io.PrintWriter;
import java.io.FileInputStream;
import java.io.IOException;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import java.io.FileOutputStream;
/**
* A sample DOM writer. This sample program illustrates how to
* traverse a DOM tree in order to print a document that is parsed.
*
*/
public class TestDOM {
protected PrintWriter out;
public TestDOM() {
try
{
FileOutputStream outxml=new FileOutputStream("D:/test.xml");
out = new PrintWriter(outxml);
}
catch(Exception e)
{
  e.printStackTrace();
 }
}
/** Prints the specified node, recursively. */
public void print(Node node) {
if ( node == null ) {
return;
}
int type = node.getNodeType();
switch ( type ) {
case Node.DOCUMENT_NODE:

out.println("<?xml version=/"1.0/" encoding=/"GBK/"?>");
print(((Document)node).getDocumentElement());
out.flush();
break;
case Node.ELEMENT_NODE:
out.print('<');
out.print(node.getNodeName());
NamedNodeMap attrs = node.getAttributes();
for ( int i = 0; i < attrs.getLength(); i++ ) {
out.print(' ');
out.print(attrs.item(i).getNodeName());
out.print("=/"");
out.print(attrs.item(i).getNodeValue());
out.print('"');
}
out.print('>');
out.println(); // HACK
NodeList children = node.getChildNodes();
if ( children != null ) {
int len = children.getLength();
for ( int i = 0; i < len; i++ ) {
print(children.item(i));
}
}
break;
case Node.TEXT_NODE:
out.print(node.getNodeValue());
break;
}
if ( type == Node.ELEMENT_NODE ) {
out.print("</");
out.print(node.getNodeName());
out.print('>');
out.println(); // HACK
}
out.flush();
}
public static void main(String args[]) { 
 String conf="D:/tidy.properties";
FileInputStream in;

Tidy tidy = new Tidy();
tidy.setConfigurationFromFile(conf);
TestDOM t = new TestDOM();
try {
in = new FileInputStream("D:/speed.html");
tidy.setMakeClean(true);
tidy.setXmlTags(true);
t.print(tidy.parseDOM(in, null));
}
catch ( IOException e ) {
System.err.println( e.toString() );
}
}
}

方法二：可以解决乱码，但解析时出现“White spaces are required between publicId and systemId”错误

package spide;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import org.w3c.tidy.Tidy;
public class Test17 implements Runnable {
    private String srcFileName;
    private String outFileName;
    private String errOutFileName;
    private String configFileName;
    public Test17(String srcFileName, String outFileName,??? String confName) {
          this.srcFileName = srcFileName;
          this.outFileName = outFileName;
          this.configFileName= confName;
     }
      public void run() {
          BufferedInputStream in;
          FileOutputStream out;
          Tidy tidy = new Tidy();
     tidy.setConfigurationFromFile(configFileName);
     try {
       // tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName), true));
                in = new BufferedInputStream(new FileInputStream(srcFileName));
                out = new FileOutputStream(outFileName);
               String head = "<?xml version=/"1.0/" encoding=/"GBK/"?>";
               byte[] bytes = head.getBytes();
               out.write(bytes, 0, bytes.length);

tidy.parse(in, out);
           } catch (IOException e) {
                    System.out.println(this.toString() + e.toString());
          }
      }
     public static void main(String[] args) {
     String src="D:/speed.html";
     String out="D:/result.xml";
     String err="D:/err.txt";
     String conf="D:/tidy.properties";
          Test17 t1 = new Test17(src,out,conf);
           Thread th1 = new Thread(t1);
           th1.start();
      }
}

JTidy转换html到xml

JTidy转换html到xml

相关阅读

相关文章

相关问答

相关文档