1

I want to create a dataframe for all the csv files within my input_path. My code is raising UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 14: invalid start byte error. I've also tried calling read_csv with encoding='latin1', encoding='iso-8859-1' or encoding='cp1252'.

import os
import pandas as pd

input_path = "../input_data/"

# Collect one dataframe per CSV found anywhere under input_path, then
# concatenate them at the end (the original overwrote `df` on every file,
# keeping only the last one).
frames = []

# Walk all subfolders within the path.
for root, dirs, files in os.walk(input_path):
    for file in files:
        # os.walk yields every file; skip anything that isn't a CSV so we
        # don't feed arbitrary binaries to the parser.
        if not file.lower().endswith(".csv"):
            continue
        path = os.path.join(root, file)
        # Pass the *path*, not a text-mode handle: open(..., "r") decodes
        # with the locale default codec before pandas ever sees the bytes,
        # which is why every `encoding=` argument appeared to be ignored
        # and the UnicodeDecodeError kept firing. With the path, pandas
        # itself opens the file using the requested codec.
        # latin1 maps every byte value, so it never raises — but verify it
        # is the files' real encoding (see chardet below) or data may be
        # silently mis-decoded.
        frames.append(pd.read_csv(path, encoding="latin1"))

# Single combined dataframe; empty frame if no CSVs were found.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

Traceback:

> --------------------------------------------------------------------------- UnicodeDecodeError                        Traceback (most recent call
> last) /tmp/ipykernel_136/3748812978.py in <module>
>       3     for file in files:
>       4         with open(os.path.join(root, file), "r") as data:
> ----> 5             df = pd.read_csv(data, encoding='utf_8')
> 
> ~/.local/lib/python3.8/site-packages/pandas/util/_decorators.py in
> wrapper(*args, **kwargs)
>     309                     stacklevel=stacklevel,
>     310                 )
> --> 311             return func(*args, **kwargs)
>     312 
>     313         return wrapper
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col,
> usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters,
> true_values, false_values, skipinitialspace, skiprows, skipfooter,
> nrows, na_values, keep_default_na, na_filter, verbose,
> skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col,
> date_parser, dayfirst, cache_dates, iterator, chunksize, compression,
> thousands, decimal, lineterminator, quotechar, quoting, doublequote,
> escapechar, comment, encoding, encoding_errors, dialect,
> error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace,
> low_memory, memory_map, float_precision, storage_options)
>     584     kwds.update(kwds_defaults)
>     585 
> --> 586     return _read(filepath_or_buffer, kwds)
>     587 
>     588 
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _read(filepath_or_buffer, kwds)
>     480 
>     481     # Create the parser.
> --> 482     parser = TextFileReader(filepath_or_buffer, **kwds)
>     483 
>     484     if chunksize or iterator:
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> __init__(self, f, engine, **kwds)
>     809             self.options["has_index_names"] = kwds["has_index_names"]
>     810 
> --> 811         self._engine = self._make_engine(self.engine)
>     812 
>     813     def close(self):
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _make_engine(self, engine)    1038             )    1039         # error: Too many arguments for "ParserBase"
> -> 1040         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]    1041     1042     def
> _failover_to_python(self):
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py
> in __init__(self, src, **kwds)
>      67         kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
>      68         try:
> ---> 69             self._reader = parsers.TextReader(self.handles.handle, **kwds)
>      70         except Exception:
>      71             self.handles.close()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader.__cinit__()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._get_header()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._tokenize_rows()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.raise_parser_error()
> 
> UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position
> 14: invalid start byte
4
  • First: set the right encoding schema for each file (maybe your target file has Latin encoding, for example). Second: pass errors='ignore' to open() (or encoding_errors='ignore' to read_csv) to skip undecodable bytes! (not recommended) Commented Oct 3, 2021 at 7:24
  • Maybe see stackoverflow.com/questions/30462807/… Commented Oct 3, 2021 at 7:25
  • How do I find out what is the right encoding schema? Commented Oct 3, 2021 at 7:26
  • Also, your code will try to load all files found by os.walk() but only ever keeps the last dataframe, since df is reassigned on each iteration. It may also be treating unwanted non-CSV files as if they were CSV format. Try printing the filenames — that might give you a clue. Commented Oct 3, 2021 at 7:27

1 Answer 1

0

Try to determine your file encoding using chardet package.

Demo:

# Python env: pip install chardet
# Anaconda env: conda install chardet

import chardet
import pathlib

input_path = "../input_data/"
# Incremental detector: feed raw bytes until chardet is confident,
# resetting between files so one file's detection can't bleed into the next.
detector = chardet.UniversalDetector()

for filename in pathlib.Path(input_path).glob('*.csv'):
    detector.reset()
    # Print the actual path being inspected (matches the sample Output).
    print(f"Filename: {filename}")
    # Open in binary mode — chardet needs the undecoded bytes.
    # Use a context manager so the handle is closed even on early break.
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                # Confidence threshold reached; no need to read the rest.
                break
    detector.close()
    print(f"Encoding: {detector.result['encoding']} (confidence: {detector.result['confidence']})\n")

Output:

Filename: ../input_data/file1.csv
Encoding: Windows-1252 (confidence: 0.7299263369321677)

Filename: ../input_data/file2.csv
Encoding: ascii (confidence: 1.0)

Filename: ../input_data/file3.csv
Encoding: ISO-8859-1 (confidence: 0.73)

Filename: ../input_data/file4.csv
Encoding: utf-8 (confidence: 0.99)

Filename: ../input_data/file5.csv
Encoding: ISO-8859-1 (confidence: 0.73)
Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.