I need to handle uploads of CSVs that are up to 350 MB with around 3,000,000 lines. I only need to check the header on the first line. Is there an efficient way to do this? Using node-csv takes a long time because it parses the whole thing. I'm using busboy and have a stream.
2 Answers
I've never had to parse a file this large, but maybe you can try a combination of the event-stream and get-line packages:
var fs = require('fs');
var es = require('event-stream');
var getLine = require('get-line');

// emit only line 1 (the header) from the input stream
var getLines = getLine({
  lines: [1],
  encoding: 'utf8'
});

console.time('get first line');

fs.createReadStream('./test.csv', { encoding: 'utf8' })
  .pipe(getLines)
  .pipe(es.map(function (line, next) {
    // called once per selected line; split the header into column names
    var data = line.split(',').map(function (c) { return c.trim(); });
    console.log(data);
    return next(null, line);
  }))
  .pipe(es.wait(function (err, body) {
    // called after all selected lines have been processed;
    // can be removed if you don't need it
    console.timeEnd('get first line');
  }));
Edit: I just tried my code with a large CSV from here (the star2002 file, 2 GB), and here is the result:
[ '1',
'1613423',
'807',
'20011015.2226039991',
'1613424',
'4518',
'0',
'0',
'654',
'1395',
'20011204.1149509996',
'10.955403',
'2288071',
'-0.28820264',
'0.40731233',
'10.559091' ]
get first line: 15ms
1 Comment
jabbermonkey
The get-line module does what I need and is much faster than needlessly parsing the 3 million lines. I'll use it for now, but it seems like there should be a simpler way.
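For what it's worth, one simpler option is to skip the helper packages entirely and use Node's built-in readline module against the upload stream, stopping after the first 'line' event. A minimal sketch (readHeader and fileStream are placeholder names, not part of either answer):

var readline = require('readline');

// `fileStream` would be the stream you get from busboy's 'file' event
function readHeader(fileStream, callback) {
  var rl = readline.createInterface({ input: fileStream });

  rl.once('line', function (line) {
    rl.close();
    // let the rest of the upload drain so busboy can finish
    fileStream.resume();
    callback(null, line.split(',').map(function (c) { return c.trim(); }));
  });
}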
The problem might be with how you're using the node-csv package. Its samples include an example of parsing a large CSV file as a stream. Here is the sample from GitHub:
var csv = require('csv'); // the repo sample uses require('..') because it lives inside the node-csv project
var i = 0;

var generator = csv.generate({ seed: 1, columns: 2, length: 20 });
var parser = csv.parse();
var transformer = csv.transform(function (data) {
  i++;
  return data.map(function (value) { return value.toUpperCase(); });
});
var stringifier = csv.stringify();

generator.on('readable', function () {
  var data;
  while ((data = generator.read())) {
    parser.write(data);
  }
});
generator.on('end', function () {
  parser.end();
});

parser.on('readable', function () {
  var data;
  while ((data = parser.read())) {
    transformer.write(data);
  }
});
parser.on('end', function () {
  transformer.end();
});

transformer.on('readable', function () {
  var data;
  while ((data = transformer.read())) {
    stringifier.write(data);
  }
});
transformer.on('end', function () {
  stringifier.end();
});

stringifier.on('readable', function () {
  var data;
  while ((data = stringifier.read())) {
    process.stdout.write(data);
  }
});
generator.on('end', function () {
  process.stdout.write('=> ' + i + ' records\n');
});
1 Comment
jabbermonkey
I tried that, but reading through every line, even while doing nothing else, is just too slow and pointless.