解析数百万 XML 个文件 - Java
Parsing millions of XML files - Java
我正在研究 XML 解析技术,并决定使用 SAX 而不是 DOM 解析器。数据是数百万个 XML 文件,每个文件约 6KB。我使用的是 SAXParser。
我一个一个循环遍历所有调用 parser.parse(file,handler) 的文件,但在 100,000 次之后,我收到堆内存不足错误。当我试图转储我的堆并读取它时,我看到存储了很多字符数组和字符串。
问题是,我如何在没有堆错误的情况下解析数百万个小文件。
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import java.util.*;
import java.io.*;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
*
* @author Ajinkya Jumbad
*/
public class dataset {
static List<String> cols;
public HashMap<String, HashMap> hm = new HashMap<>();
static int i =0;
dataset() {
String coln[] = {
"UID",
"Name",
"NationID",
"Born",
"Age",
"IntCaps",
"IntGoals",
"U21Caps",
"U21Goals",
"Height",
"Weight",
"AerialAbility",
"CommandOfArea",
"Communication",
"Eccentricity",
"Handling",
"Kicking",
"OneOnOnes",
"Reflexes",
"RushingOut",
"TendencyToPunch",
"Throwing",
"Corners",
"Crossing",
"Dribbling",
"Finishing",
"FirstTouch",
"Freekicks",
"Heading",
"LongShots",
"Longthrows",
"Marking",
"Passing",
"PenaltyTaking",
"Tackling",
"Technique",
"Aggression",
"Anticipation",
"Bravery",
"Composure",
"Concentration",
"Vision",
"Decisions",
"Determination",
"Flair",
"Leadership",
"OffTheBall",
"Positioning",
"Teamwork",
"Workrate",
"Acceleration",
"Agility",
"Balance",
"Jumping",
"LeftFoot",
"NaturalFitness",
"Pace",
"RightFoot",
"Stamina",
"Strength",
"Consistency",
"Dirtiness",
"ImportantMatches",
"InjuryProness",
"Versatility",
"Adaptability",
"Ambition",
"Loyalty",
"Pressure",
"Professional",
"Sportsmanship",
"Temperament",
"Controversy",
"PositionsDesc",
"Goalkeeper",
"Sweeper",
"Striker",
"AttackingMidCentral",
"AttackingMidLeft",
"AttackingMidRight",
"DefenderCentral",
"DefenderLeft",
"DefenderRight",
"DefensiveMidfielder",
"MidfielderCentral",
"MidfielderLeft",
"MidfielderRight",
"WingBackLeft",
"WingBackRight"};
cols = Arrays.asList(coln);
try {
File f = new File("C:\Users\Ajinkya Jumbad\Desktop\fmdata");
//File files[] = f.listFiles();
for (File file : f.listFiles()) {
//System.out.println(file.getAbsolutePath());
if (file.isFile()) {
parse p = new parse(file);
}
}
//savefile();
} catch (Exception ex) {
Logger.getLogger(dataset.class.getName()).log(Level.SEVERE, null, ex);
}
}
private void savefile() {
try {
String file_name = "dataset.csv";
FileWriter w = new FileWriter(file_name);
writecsv ws = new writecsv();
boolean first = true;
StringBuilder sb = new StringBuilder();
for (String key : cols) {
if (!first) {
sb.append(",");
}
sb.append(key);
first = false;
}
sb.append("\n");
w.append(sb.toString());
for (String uid : hm.keySet()) {
ws.writeLine(w, hm.get(uid));
}
w.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
public class parse{
parse(File file){
try {
SAXParserFactory parserfac = SAXParserFactory.newInstance();
parserfac.setNamespaceAware(true);
SAXParser parser = parserfac.newSAXParser();
DefaultHandler handler = new DefaultHandler(){
HashMap<String, String> ht;
@Override
public void startDocument() {
ht = new HashMap<>();
}
@Override
public void startElement(String namespaceURI,
String localName,
String qName,
Attributes atts) {
if (atts.getValue("Value") != null && cols.contains(localName)) {
//System.out.println(localName);
String key = localName;
ht.put(key, atts.getValue("Value"));
}
}
@Override
public void endDocument() {
String uid = ht.get("UID");
hm.put(uid, ht);
dataset.i += 1;
if(dataset.i%100 == 0){
System.out.println(dataset.i);
}
}
@Override
public void characters(char ch[], int start, int length) throws SAXException {
}
};
parser.parse(file, handler);
} catch (Exception ex) {
Logger.getLogger(dataset.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
public static void main(String[] args) {
dataset ds = new dataset();
}
}
A:完成后关闭文件。
B:如果问题仍然出现,可以跟踪可用内存并在必要时调用 gc()。这种做法有点取巧,但如果可行的话……
C:如果你可以使用多线程,就尽量多开几个线程;给每个线程分配一个编号 N,让它只处理按 N 取模分配给它的那部分文件。
首先,重用 SAXParserFactory 和解析器本身。创建一个 SAXParserFactory 可能非常昂贵,而创建一个解析器也不便宜。总而言之,这些操作可能比实际解析输入花费的时间要长得多。但那是为了节省时间,而不是内存。
就内存而言,我怀疑堆空间全部被您自己的数据结构占用:特别是您存放结果的那个 HashMap。尝试使用 JVisualVM 检查堆以确认这一点。
至于核心问题——"how do I parse this data without running out of memory"——这完全取决于您要对数据执行的操作。没有人为了好玩而解析 XML 数据;您这样做是因为想将数据用于某种目的。如果不了解 (a) 您想对数据做什么,以及 (b) 数据量有多大(您已经给出了大致规模,但您还应该告诉我们希望这个 HashMap 最终包含多少条目,以及每个条目有多大),就无法给出具体的解决方案。
还有一件显而易见的小事,以防你没有意识到:使用 Java 命令行上的 -Xmx 选项来控制可用堆空间的大小。
我正在研究 XML 解析技术,并决定使用 SAX 而不是 DOM 解析器。数据是数百万个 XML 文件,每个文件约 6KB。我使用的是 SAXParser。
我一个一个循环遍历所有调用 parser.parse(file,handler) 的文件,但在 100,000 次之后,我收到堆内存不足错误。当我试图转储我的堆并读取它时,我看到存储了很多字符数组和字符串。
问题是,我如何在没有堆错误的情况下解析数百万个小文件。
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import java.util.*;
import java.io.*;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
*
* @author Ajinkya Jumbad
*/
public class dataset {
static List<String> cols;
public HashMap<String, HashMap> hm = new HashMap<>();
static int i =0;
dataset() {
String coln[] = {
"UID",
"Name",
"NationID",
"Born",
"Age",
"IntCaps",
"IntGoals",
"U21Caps",
"U21Goals",
"Height",
"Weight",
"AerialAbility",
"CommandOfArea",
"Communication",
"Eccentricity",
"Handling",
"Kicking",
"OneOnOnes",
"Reflexes",
"RushingOut",
"TendencyToPunch",
"Throwing",
"Corners",
"Crossing",
"Dribbling",
"Finishing",
"FirstTouch",
"Freekicks",
"Heading",
"LongShots",
"Longthrows",
"Marking",
"Passing",
"PenaltyTaking",
"Tackling",
"Technique",
"Aggression",
"Anticipation",
"Bravery",
"Composure",
"Concentration",
"Vision",
"Decisions",
"Determination",
"Flair",
"Leadership",
"OffTheBall",
"Positioning",
"Teamwork",
"Workrate",
"Acceleration",
"Agility",
"Balance",
"Jumping",
"LeftFoot",
"NaturalFitness",
"Pace",
"RightFoot",
"Stamina",
"Strength",
"Consistency",
"Dirtiness",
"ImportantMatches",
"InjuryProness",
"Versatility",
"Adaptability",
"Ambition",
"Loyalty",
"Pressure",
"Professional",
"Sportsmanship",
"Temperament",
"Controversy",
"PositionsDesc",
"Goalkeeper",
"Sweeper",
"Striker",
"AttackingMidCentral",
"AttackingMidLeft",
"AttackingMidRight",
"DefenderCentral",
"DefenderLeft",
"DefenderRight",
"DefensiveMidfielder",
"MidfielderCentral",
"MidfielderLeft",
"MidfielderRight",
"WingBackLeft",
"WingBackRight"};
cols = Arrays.asList(coln);
try {
File f = new File("C:\Users\Ajinkya Jumbad\Desktop\fmdata");
//File files[] = f.listFiles();
for (File file : f.listFiles()) {
//System.out.println(file.getAbsolutePath());
if (file.isFile()) {
parse p = new parse(file);
}
}
//savefile();
} catch (Exception ex) {
Logger.getLogger(dataset.class.getName()).log(Level.SEVERE, null, ex);
}
}
private void savefile() {
try {
String file_name = "dataset.csv";
FileWriter w = new FileWriter(file_name);
writecsv ws = new writecsv();
boolean first = true;
StringBuilder sb = new StringBuilder();
for (String key : cols) {
if (!first) {
sb.append(",");
}
sb.append(key);
first = false;
}
sb.append("\n");
w.append(sb.toString());
for (String uid : hm.keySet()) {
ws.writeLine(w, hm.get(uid));
}
w.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
}
public class parse{
parse(File file){
try {
SAXParserFactory parserfac = SAXParserFactory.newInstance();
parserfac.setNamespaceAware(true);
SAXParser parser = parserfac.newSAXParser();
DefaultHandler handler = new DefaultHandler(){
HashMap<String, String> ht;
@Override
public void startDocument() {
ht = new HashMap<>();
}
@Override
public void startElement(String namespaceURI,
String localName,
String qName,
Attributes atts) {
if (atts.getValue("Value") != null && cols.contains(localName)) {
//System.out.println(localName);
String key = localName;
ht.put(key, atts.getValue("Value"));
}
}
@Override
public void endDocument() {
String uid = ht.get("UID");
hm.put(uid, ht);
dataset.i += 1;
if(dataset.i%100 == 0){
System.out.println(dataset.i);
}
}
@Override
public void characters(char ch[], int start, int length) throws SAXException {
}
};
parser.parse(file, handler);
} catch (Exception ex) {
Logger.getLogger(dataset.class.getName()).log(Level.SEVERE, null, ex);
}
}
}
public static void main(String[] args) {
dataset ds = new dataset();
}
}
A:完成后关闭文件。
B:如果问题仍然出现,可以跟踪可用内存并在必要时调用 gc()。这种做法有点取巧,但如果可行的话……
C:如果你可以使用多线程,就尽量多开几个线程;给每个线程分配一个编号 N,让它只处理按 N 取模分配给它的那部分文件。
首先,重用 SAXParserFactory 和解析器本身。创建一个 SAXParserFactory 可能非常昂贵,而创建一个解析器也不便宜。总而言之,这些操作可能比实际解析输入花费的时间要长得多。但那是为了节省时间,而不是内存。
就内存而言,我怀疑堆空间全部被您自己的数据结构占用:特别是您存放结果的那个 HashMap。尝试使用 JVisualVM 检查堆以确认这一点。
至于核心问题——"how do I parse this data without running out of memory"——这完全取决于您要对数据执行的操作。没有人为了好玩而解析 XML 数据;您这样做是因为想将数据用于某种目的。如果不了解 (a) 您想对数据做什么,以及 (b) 数据量有多大(您已经给出了大致规模,但您还应该告诉我们希望这个 HashMap 最终包含多少条目,以及每个条目有多大),就无法给出具体的解决方案。
还有一件显而易见的小事,以防你没有意识到:使用 Java 命令行上的 -Xmx 选项来控制可用堆空间的大小。