将 7M 行的 CSV 文件解析为 Java 对象时内存不足

OutOfMemory when parsing a CSV file of 7M lines into Java Object

我正在逐行提取一个包含超过 7M 行的 CSV 文件,占用磁盘空间超过 1Gig space。

读取 List<String> 的操作很好,不到 2 分钟即可完成。 但问题是当我尝试在这个列表上循环并将每一行映射到一个对象 Balance 然后我创建了一个 OuyOfMemoryException:

01:00:30.664 [restartedMain] ERROR org.springframework.batch.core.step.AbstractStep - Encountered an error executing step readInputStep in job readCsvJob
java.lang.OutOfMemoryError: GC overhead limit exceeded
    at java.lang.AbstractStringBuilder.<init>(AbstractStringBuilder.java:68) ~[?:1.8.0_172]
    at java.lang.StringBuffer.<init>(StringBuffer.java:128) ~[?:1.8.0_172]
    at java.text.DigitList.getStringBuffer(DigitList.java:804) ~[?:1.8.0_172]
    at java.text.DigitList.getDouble(DigitList.java:164) ~[?:1.8.0_172]
    at java.text.DecimalFormat.parse(DecimalFormat.java:2089) ~[?:1.8.0_172]
    at java.text.NumberFormat.parse(NumberFormat.java:383) ~[?:1.8.0_172]
    at fr.payet.flad.batch.mapper.BalanceLineMapper.parseToDouble(BalanceLineMapper.java:56) ~[classes/:?]
    at fr.payet.flad.batch.mapper.BalanceLineMapper.toBalance(BalanceLineMapper.java:40) ~[classes/:?]
    at fr.payet.flad.batch.tasklet.ReadInputTasklet.execute(ReadInputTasklet.java:56) ~[classes/:?]

这是我的 BalanceLineMapper 代码:

@Component
@Slf4j
public class BalanceLineMapper {

    public Balance toBalance(String[] ligneCsv, int cursorIndex) {
        try {
            return Balance.builder()
                    .index(cursorIndex)
                    .exer(ligneCsv[0])
                    .ident(ligneCsv[1])
                    .nDept(ligneCsv[2])
                    .lBudg(ligneCsv[3])
                    .insee(ligneCsv[4])
                    .siren(ligneCsv[5])
                    .cRegi(ligneCsv[6])
                    .nomen(ligneCsv[7])
                    .cType(ligneCsv[8])
                    .cstyp(ligneCsv[9])
                    .cActi(ligneCsv[10])
                    .finess(ligneCsv[11])
                    .secteur(ligneCsv[12])
                    .cBudg(ligneCsv[13])
                    .codBud1(ligneCsv[14])
                    .compte(ligneCsv[15])
                    .BEDeb(ligneCsv[16])
                    .BECre(parseToDouble(ligneCsv[17]))
                    .OBNetDeb(parseToDouble(ligneCsv[18]))
                    .OBNetCre(parseToDouble(ligneCsv[19]))
                    .ONBDeb(parseToDouble(ligneCsv[20]))
                    .ONBCre(parseToDouble(ligneCsv[21]))
                    .OOBDeb(parseToDouble(ligneCsv[22]))
                    .OOBCre(parseToDouble(ligneCsv[23]))
                    .sd(parseToDouble(ligneCsv[24]))
                    .sc(parseToDouble(ligneCsv[25]))
                    .build();
        } catch (NumberFormatException e) {
            log.debug("Erreur lors de du casting");
        }
        return null;
    }

    private Double parseToDouble(String number){
        NumberFormat format = NumberFormat.getInstance(Locale.FRANCE);
        try {
             return format.parse(number).doubleValue();
        }catch (ParseException e){
            log.error("Erreur de parsing de {} en Java Double", number, e.getMessage(), e);
        }
        log.error("parseToDouble retourne la valeur NULL");
        return null;
    }

}

和 ReadInputTasklet 代码:

@Slf4j
@Component
public class ReadInputTasklet implements Tasklet, StepExecutionListener {

    @Autowired
    BalanceLineMapper balanceLineMapper;

    @Override
    public RepeatStatus execute(StepContribution stepContribution, ChunkContext chunkContext) throws Exception {
        List<Balance> balances = Lists.newArrayList();
        List<String> balancesList = Lists.newArrayList();
        try {
            CSVReader reader = new CSVReader(new FileReader("/Users/ghassen/Desktop/FLAD/Balance_Commune_2016.csv"), '\n');
            String[] nextLine;
            int cursorIndex = 0;
            while ((nextLine = reader.readNext()) != null) {
                if (cursorIndex != 0){
                    balancesList.add(nextLine[0]);
                    log.debug("{} balance(s) ajoutée(s) dans la liste ...", balancesList.size());
                }
                cursorIndex++;
            }
            log.debug("Lecture de toutes les lignes terminé");

            log.debug("Parsing de toutes les lignes");
            for (String line : balancesList){
                String[] lineSeperated = StringUtils.splitByWholeSeparatorPreserveAllTokens(line,";");
                balances.add(balanceLineMapper.toBalance(lineSeperated, cursorIndex));
            }
            log.debug("Job terminé");
        } catch (IOException e) {
            log.error("File not found", e);
        }
        return RepeatStatus.FINISHED;
    }

    @Override
    public void beforeStep(StepExecution stepExecution) {

    }

    @Override
    public ExitStatus afterStep(StepExecution stepExecution) {
        return null;
    }
}

您在短时间内创建了大量实例(包括您稍后解析的字符串),垃圾收集器无法跟上。我建议您以流式设计构建整个系统,并且只解析您实际需要的那些。

我同意@AUser。但是,让我更具体一点。您可以将 parseToDouble 函数替换为标准 Double.valueOf()。它应该更有效率。