Spring Batch 读取具有 100 万行、900MB 大小的 CSV 文件时出现 OutOfMemoryException
Spring Batch OutOfMemoryException when reading a csv file with 1M rows and 900Mo size
我正在尝试使用 FlatFileItemReader
读取一个包含超过 100 万行的非常大的 CSV 文件,但是当启动我的批处理作业时,我在大约 10 分钟后得到一个 OutOfMemoryException
。
这是我的代码:
@Slf4j
@Configuration
@EnableBatchProcessing
@ComponentScan({
"f.p.f.batch",
"f.p.f.batch.tasklet"
})
public class BatchConfig {
@Autowired
private StepBuilderFactory steps;
@Autowired
private JobBuilderFactory jobBuilderFactory;
@Autowired
private DemoTasklet demoTasklet;
@Bean
public ResourcelessTransactionManager transactionManager() {
return new ResourcelessTransactionManager();
}
@Bean
public JobRepository jobRepository(ResourcelessTransactionManager transactionManager) {
MapJobRepositoryFactoryBean mapJobRepositoryFactoryBean = new MapJobRepositoryFactoryBean(transactionManager);
mapJobRepositoryFactoryBean.setTransactionManager(transactionManager);
try {
return mapJobRepositoryFactoryBean.getObject();
} catch (Exception ex) {
log.error("Exception : {}", ex.getMessage(), ex);
return null;
}
}
@Bean
//@StepScope
public FlatFileItemReader<Balance> csvAnimeReader() {
FlatFileItemReader<Balance> reader = new FlatFileItemReader<>();
DefaultLineMapper lineMapper = new DefaultLineMapper();
FieldSetMapper fieldSetMapper = new BalanceFieldSetMapper();
DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
tokenizer.setNames(new String[]{
"EXER",
"IDENT",
"NDEPT",
"LBUDG",
"INSEE",
"SIREN",
"CREGI",
"NOMEN",
"CTYPE",
"CSTYP",
"CACTI",
"FINESS",
"SECTEUR",
"CBUDG",
"CODBUD1",
"COMPTE ",
"BEDEB",
"BECRE",
"OBNETDEB",
"OBNETCRE",
"ONBDEB",
"ONBCRE",
"OOBDEB",
"OOBCRE",
"SD",
"SC"
});
tokenizer.setDelimiter(";");
lineMapper.setLineTokenizer(tokenizer);
lineMapper.setFieldSetMapper(fieldSetMapper);
reader.setLineMapper(lineMapper);
reader.setResource(new ClassPathResource("Balance_Exemple_2016.csv"));
reader.setLinesToSkip(1);
return reader;
}
@Bean
public ItemProcessor<Balance, Balance> CsvFileProcessor() {
return new BalanceProcessor();
}
@Bean
public BalanceWriter balanceWriter() {
return new BalanceWriter();
}
@Bean
public SimpleJobLauncher jobLauncher(JobRepository jobRepository) {
SimpleJobLauncher simpleJobLauncher = new SimpleJobLauncher();
simpleJobLauncher.setJobRepository(jobRepository);
return simpleJobLauncher;
}
@Bean
public Step step1() {
return steps.get("step1")
.<Balance, Balance>chunk(1)
.reader(csvAnimeReader())
.writer(balanceWriter())
.build();
}
@Bean
public Step step2() {
return steps.get("step2")
.tasklet(demoTasklet)
.build();
}
@Bean
public Job readCsvJob() {
return jobBuilderFactory.get("readCsvJob")
.incrementer(new RunIdIncrementer())
.flow(step1())
.next(step2())
.end()
.build();
}
}
我建议你增加 JVM 的最大堆内存:其默认值非常低(远小于 900MB)。在 VM 参数中添加 -Xmx4g,即可将 JVM 最大堆内存设置为 4GB。
您可以在此处找到有关默认值 xmx 的所有文档 https://docs.oracle.com/cd/E13150_01/jrockit_jvm/jrockit/jrdocs/refman/optionX.html
如果你在命令行运行,请使用:java -Xmx4g -jar myprog.jar(注意 VM 参数 -Xmx4g 必须放在 -jar 之前)。
如果你在 Eclipse 中运行:Run -> Run Configurations -> 选择你使用的启动配置,然后在 "Arguments" 选项卡的 VM arguments 文本区域中添加 -Xmx4g。
我建议你使用流式传输,因为你永远不想一次读取所有文件,这是一个主要问题。
这里有一篇很好的文章,介绍如何更高效地读取文件而不耗尽全部内存空间。
我正在尝试使用 FlatFileItemReader
读取一个包含超过 100 万行的非常大的 CSV 文件,但是当启动我的批处理作业时,我在大约 10 分钟后得到一个 OutOfMemoryException
。
这是我的代码:
@Slf4j
@Configuration
@EnableBatchProcessing
@ComponentScan({
"f.p.f.batch",
"f.p.f.batch.tasklet"
})
public class BatchConfig {
@Autowired
private StepBuilderFactory steps;
@Autowired
private JobBuilderFactory jobBuilderFactory;
@Autowired
private DemoTasklet demoTasklet;
@Bean
public ResourcelessTransactionManager transactionManager() {
return new ResourcelessTransactionManager();
}
@Bean
public JobRepository jobRepository(ResourcelessTransactionManager transactionManager) {
MapJobRepositoryFactoryBean mapJobRepositoryFactoryBean = new MapJobRepositoryFactoryBean(transactionManager);
mapJobRepositoryFactoryBean.setTransactionManager(transactionManager);
try {
return mapJobRepositoryFactoryBean.getObject();
} catch (Exception ex) {
log.error("Exception : {}", ex.getMessage(), ex);
return null;
}
}
@Bean
//@StepScope
public FlatFileItemReader<Balance> csvAnimeReader() {
FlatFileItemReader<Balance> reader = new FlatFileItemReader<>();
DefaultLineMapper lineMapper = new DefaultLineMapper();
FieldSetMapper fieldSetMapper = new BalanceFieldSetMapper();
DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
tokenizer.setNames(new String[]{
"EXER",
"IDENT",
"NDEPT",
"LBUDG",
"INSEE",
"SIREN",
"CREGI",
"NOMEN",
"CTYPE",
"CSTYP",
"CACTI",
"FINESS",
"SECTEUR",
"CBUDG",
"CODBUD1",
"COMPTE ",
"BEDEB",
"BECRE",
"OBNETDEB",
"OBNETCRE",
"ONBDEB",
"ONBCRE",
"OOBDEB",
"OOBCRE",
"SD",
"SC"
});
tokenizer.setDelimiter(";");
lineMapper.setLineTokenizer(tokenizer);
lineMapper.setFieldSetMapper(fieldSetMapper);
reader.setLineMapper(lineMapper);
reader.setResource(new ClassPathResource("Balance_Exemple_2016.csv"));
reader.setLinesToSkip(1);
return reader;
}
@Bean
public ItemProcessor<Balance, Balance> CsvFileProcessor() {
return new BalanceProcessor();
}
@Bean
public BalanceWriter balanceWriter() {
return new BalanceWriter();
}
@Bean
public SimpleJobLauncher jobLauncher(JobRepository jobRepository) {
SimpleJobLauncher simpleJobLauncher = new SimpleJobLauncher();
simpleJobLauncher.setJobRepository(jobRepository);
return simpleJobLauncher;
}
@Bean
public Step step1() {
return steps.get("step1")
.<Balance, Balance>chunk(1)
.reader(csvAnimeReader())
.writer(balanceWriter())
.build();
}
@Bean
public Step step2() {
return steps.get("step2")
.tasklet(demoTasklet)
.build();
}
@Bean
public Job readCsvJob() {
return jobBuilderFactory.get("readCsvJob")
.incrementer(new RunIdIncrementer())
.flow(step1())
.next(step2())
.end()
.build();
}
}
我建议你增加 JVM 的最大堆内存:其默认值非常低(远小于 900MB)。在 VM 参数中添加 -Xmx4g,即可将 JVM 最大堆内存设置为 4GB。
您可以在此处找到有关默认值 xmx 的所有文档 https://docs.oracle.com/cd/E13150_01/jrockit_jvm/jrockit/jrdocs/refman/optionX.html
如果你在命令行运行,请使用:java -Xmx4g -jar myprog.jar(注意 VM 参数 -Xmx4g 必须放在 -jar 之前)。
如果你在 Eclipse 中运行:Run -> Run Configurations -> 选择你使用的启动配置,然后在 "Arguments" 选项卡的 VM arguments 文本区域中添加 -Xmx4g。
我建议你使用流式传输,因为你永远不想一次读取所有文件,这是一个主要问题。
这里有一篇很好的文章,介绍如何更高效地读取文件而不耗尽全部内存空间。