I'm scratching my head on a CSV file I cannot parse correctly, due to many errors. I extracted a sample you can download here: Test CSV File
Main errors (or what generated an error) are:
- Quotes & commas (many errors when trying to parse the file with R)
- Empty rows
- Unexpected line break inside a field
I first decided to use regular expressions, line by line, to clean the data before loading it into R, but I couldn't solve the problem that way and it was too slow (200 MB file).
So I decided to use a CSV parser under Node.js with the following code:
'use strict';
const Fs = require('fs');
const Csv = require('csv');

// CSV sample to read.
const input = 'data_stack.csv';

// Settings handed to the CSV parser: comma-separated fields, double quote
// used both as the quote character and as the escape character, relaxed mode on.
const option = {
  delimiter: ',',
  quote: '"',
  escape: '"',
  relax: true,
};

// Print every record the parser emits.
const parser = Csv.parse(option);
parser.on('data', (record) => {
  console.log(record);
});

// Feed the file into the parser.
Fs.createReadStream(input).pipe(parser);
But:
- Some rows are parsed correctly (array of strings)
- Some are not parsed (all fields are one string)
- Some rows are still empty (this can be solved by adding `skip_empty_lines: true` to the options)
- I don't know how to handle the unexpected line breaks.
I don't know how to clean this CSV, either with R or with Node.js.
Any help?
EDIT:
Following @Danny_ds solution, I can parse it correctly. Now I cannot stringify it back correctly.
With console.log() I get a proper object, but when I try to stringify it, I don't get a clean CSV (it still has line breaks and empty rows).
Here is the code I'm using:
'use strict';
const Fs = require('fs');
const Csv = require('csv');

// Source CSV and destination for the re-serialised output.
const input = 'data_stack.csv';
const output = 'data_output.csv';

// Let the stream decode UTF-8 itself. Decoding each raw Buffer chunk
// separately with toString() corrupts any multi-byte character that happens
// to be split across two chunks.
const readStream = Fs.createReadStream(input, { encoding: 'utf8' });
const writeStream = Fs.createWriteStream(output);

// Parser settings: comma delimiter, double quote as both quote and escape
// character, relaxed mode on, empty lines skipped.
const opt = {delimiter: ',', quote: '"', escape: '"', relax: true, skip_empty_lines: true};

// Text clean-up applied to each chunk before it reaches the parser:
//   1. drop a double quote that directly follows a CRLF,
//   2. drop a double quote that directly precedes a CRLF,
//   3. collapse every doubled double quote into a single one.
// NOTE: the replacements run per chunk, so a pattern that straddles a chunk
// boundary is left untouched.
const transformer = Csv.transform(data => {
  const dirty = data.toString();
  return dirty
    .replace(/\r\n"/g, '\r\n')
    .replace(/"\r\n/g, '\r\n')
    .replace(/""/g, '"');
});

const parser = Csv.parse(opt);
const stringifier = Csv.stringify();

// file -> text clean-up -> CSV parser -> CSV stringifier -> file
readStream.pipe(transformer).pipe(parser).pipe(stringifier).pipe(writeStream);
EDIT 2:
Here is the final code that works:
'use strict';
const Fs = require('fs');
const Csv = require('csv');

// Source CSV and destination for the cleaned output.
const input = 'data_stack.csv';
const output = 'data_output.csv';

// Let the stream decode UTF-8 itself. Decoding each raw Buffer chunk
// separately with toString() corrupts any multi-byte character that happens
// to be split across two chunks.
const readStream = Fs.createReadStream(input, { encoding: 'utf8' });
const writeStream = Fs.createWriteStream(output);

// Parser settings: comma delimiter, double quote as both quote and escape
// character, relaxed mode on, empty lines skipped.
const opt = {delimiter: ',', quote: '"', escape: '"', relax: true, skip_empty_lines: true};

// Text clean-up applied to each chunk before it reaches the parser:
//   1. drop a double quote that directly follows a CRLF,
//   2. drop a double quote that directly precedes a CRLF,
//   3. collapse every doubled double quote into a single one.
// NOTE: the replacements run per chunk, so a pattern that straddles a chunk
// boundary is left untouched.
const transformer = Csv.transform(data => {
  const dirty = data.toString();
  return dirty
    .replace(/\r\n"/g, '\r\n')
    .replace(/"\r\n/g, '\r\n')
    .replace(/""/g, '"');
});

const parser = Csv.parse(opt);

// Post-parse clean-up of one record (an array of fields): any field longer
// than 100 characters, or whose first character is '+', is replaced by the
// placeholder text; every other field passes through unchanged.
// NOTE(review): presumably these two conditions identify fields damaged by an
// encoding problem in the source data - confirm against the real file.
const cleaner = Csv.transform(data => {
  return data.map(l => {
    if (l.length > 100 || l[0] === '+') {
      return 'Encoding issue';
    }
    return l;
  });
});

const stringifier = Csv.stringify();

// file -> text clean-up -> CSV parser -> field clean-up -> CSV stringifier -> file
readStream.pipe(transformer).pipe(parser).pipe(cleaner).pipe(stringifier).pipe(writeStream);
Thanks to everyone!

