UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 14: invalid start byte

Question

I want to create a dataframe for all the csv files within my input_path. My code is raising UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 14: invalid start byte error. I've also tried calling read_csv with encoding='latin1', encoding='iso-8859-1' or encoding='cp1252'.

import os
import pandas as pd

input_path = "../input_data/"


# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            df = pd.read_csv(data, encoding='utf_8')

Traceback:

> --------------------------------------------------------------------------- UnicodeDecodeError                        Traceback (most recent call
> last) /tmp/ipykernel_136/3748812978.py in <module>
>       3     for file in files:
>       4         with open(os.path.join(root, file), "r") as data:
> ----> 5             df = pd.read_csv(data, encoding='utf_8')
> 
> ~/.local/lib/python3.8/site-packages/pandas/util/_decorators.py in
> wrapper(*args, **kwargs)
>     309                     stacklevel=stacklevel,
>     310                 )
> --> 311             return func(*args, **kwargs)
>     312 
>     313         return wrapper
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col,
> usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters,
> true_values, false_values, skipinitialspace, skiprows, skipfooter,
> nrows, na_values, keep_default_na, na_filter, verbose,
> skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col,
> date_parser, dayfirst, cache_dates, iterator, chunksize, compression,
> thousands, decimal, lineterminator, quotechar, quoting, doublequote,
> escapechar, comment, encoding, encoding_errors, dialect,
> error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace,
> low_memory, memory_map, float_precision, storage_options)
>     584     kwds.update(kwds_defaults)
>     585 
> --> 586     return _read(filepath_or_buffer, kwds)
>     587 
>     588 
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _read(filepath_or_buffer, kwds)
>     480 
>     481     # Create the parser.
> --> 482     parser = TextFileReader(filepath_or_buffer, **kwds)
>     483 
>     484     if chunksize or iterator:
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> __init__(self, f, engine, **kwds)
>     809             self.options["has_index_names"] = kwds["has_index_names"]
>     810 
> --> 811         self._engine = self._make_engine(self.engine)
>     812 
>     813     def close(self):
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _make_engine(self, engine)    1038             )    1039         # error: Too many arguments for "ParserBase"
> -> 1040         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]    1041     1042     def
> _failover_to_python(self):
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py
> in __init__(self, src, **kwds)
>      67         kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
>      68         try:
> ---> 69             self._reader = parsers.TextReader(self.handles.handle, **kwds)
>      70         except Exception:
>      71             self.handles.close()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader.__cinit__()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._get_header()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._tokenize_rows()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.raise_parser_error()
> 
> UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position
> 14: invalid start byte

First: set the right encoding schema for each file (maybe your target file has Latin encoding for example) Second: set error='ignore' to skip the errors! (not recommended) — Meti
– Meti, Commented Oct 3, 2021 at 7:24
Also, your code will try to load all files found by os.walk() but only ever resulting in one dataframe. Maybe it's trying treat certain unwanted files as if they're CSV format when they're not. Try printing the filenames. That might give you a clue — user2668284
– user2668284, Commented Oct 3, 2021 at 7:27

Corralien · Accepted Answer · 2021-10-03 07:46:51Z

Try to determine your file encoding using chardet package.

Demo:

# Python env: pip install chardet
# Anaconda env: conda install chardet

import chardet
import pathlib

input_path = "../input_data/"
detector = chardet.UniversalDetector()

for filename in pathlib.Path(input_path).glob('*.csv'):
    detector.reset()
    print(f"Filename: {filename}")
    for line in open(filename, 'rb'):
        detector.feed(line)
        if detector.done: break
    detector.close()
    print(f"Encoding: {detector.result['encoding']} (confidence: {detector.result['confidence']})\n")

Output:

Filename: ../input_data/file1.csv
Encoding: Windows-1252 (confidence: 0.7299263369321677)

Filename: ../input_data/file2.csv
Encoding: ascii (confidence: 1.0)

Filename: ../input_data/file3.csv
Encoding: ISO-8859-1 (confidence: 0.73)

Filename: ../input_data/file4.csv
Encoding: utf-8 (confidence: 0.99)

Filename: ../input_data/file5.csv
Encoding: ISO-8859-1 (confidence: 0.73)

Collectives™ on Stack Overflow

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 14: invalid start byte

1 Answer 1

Comments

Your Answer

Linked

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Linked

Related