Python script to combine RTF files in a folder to CSV with filename as a separate column

Question

I've tried a plethora of solutions which don't work. I've figured out how to get the converted data into a csv column, but all text is in 1 cell and I haven't figured out how to get the FileNames added as a column.

import pandas as pd
import os
from striprtf.striprtf import rtf_to_text

dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\'


def getFiles():
    # Reads every regular file in dir_path, converts it from RTF to plain
    # text, and accumulates results in two lists.
    # NOTE(review): the local name `list` shadows the builtin of the same name.
    list = []
    FileNames = []
    for path in os.listdir(dir_path):
        if os.path.isfile(os.path.join(dir_path,path)):
            with open (os.path.join(dir_path,path)) as file:
                    text = file.read()
                    rtfText = rtf_to_text(text,encoding='utf-8')
                    # NOTE(review): this loops over rtfText itself (not its
                    # lines) and rebinds `text` each time, yet the body appends
                    # the whole rtfText on every iteration -- so the complete
                    # document is stored once per loop step.
                    for text in rtfText:
                        list.append(rtfText) 
                        # NOTE(review): basename() is applied to the converted
                        # text, not to `path`, so no file name is recorded.
                        FileNames.append(os.path.basename(rtfText))
    return list
    # NOTE(review): unreachable -- the function has already returned above,
    # so FileNames is never handed back to the caller.
    return FileNames

# NOTE(review): both names receive the value of the same call, so they hold
# identical data; `list` also rebinds the builtin at module level.
list = getFiles()
FileNames = getFiles()


# NOTE(review): `columns: ...` is not valid Python (keyword arguments use
# `=`), and neither of the lists built above is passed to the DataFrame.
Data = pd.DataFrame(columns: 'list','FileNames')



# Write the frame to Data.csv inside the NEW sub-folder, without the index
# column and without a header row.
NewPath = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\NEW\\'
Data.to_csv(os.path.join(NewPath,r'Data.csv'), index = False, header = False)

I've spent a few days searching Stack Overflow for a solution, and now I seem to be getting duplicate file data in each row.

It's possible that I need to create an empty DataFrame before the function, but I haven't got that working yet. I think my main issues are:

separating the text so each new line is a new cell
Distinct file content so there aren't duplicate rows
adding the FileNames.

Hopefully, the outcome looks like this...

Filename	Data
Filename_1	Data line 1
Filename_1	Data line 2
Filename_2	Data line 1
Filename_2	Data line 2
Filename_2	Data line 3

Thank you for any help :)

MzNix · Accepted Answer · 2023-06-06 16:06:16Z

A friend helped me solve this pickle :)

I was converting and saving all text in the document instead of stripping it line by line.

import pandas as pd
import os
from striprtf.striprtf import rtf_to_text

# Data structure
class fileLine:
    """A single line of extracted text paired with its source file name."""

    def __init__(self, fileName, textLine):
        # Store both fields in one tuple assignment.
        self.fileName, self.textLine = fileName, textLine

# Data files directories.
# data_dir_path is the folder that is scanned for input files;
# output_dir_path is the "NEW" sub-folder of it, where the CSV is written.
# Both are hard-coded Windows paths that end with a path separator.
data_dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\'
output_dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\NEW\\'

# Get all lines of all RTF files in a given directory
def getRTFLines(dir_path=None):
    """Collect every non-empty text line from the RTF files in a directory.

    Args:
        dir_path: Directory to scan. When omitted, the module-level
            ``data_dir_path`` is used.

    Returns:
        A list of ``fileLine`` objects, one per non-empty line of converted
        text, each carrying the name of the file the line came from.
    """
    if dir_path is None:
        dir_path = data_dir_path

    # Named rtfLines rather than `list` so the builtin is not shadowed.
    rtfLines = []
    for filename in os.listdir(dir_path):
        filePath = os.path.join(dir_path, filename)
        # Skip sub-directories and non-RTF files. The extension is compared
        # case-insensitively so that e.g. 'REPORT.RTF' is not silently skipped.
        if not (os.path.isfile(filePath) and filename.lower().endswith('.rtf')):
            continue
        # errors='ignore' drops bytes that are not valid UTF-8 instead of
        # aborting the whole run on one badly encoded file.
        with open(filePath, encoding='utf-8', errors='ignore') as file:
            text = file.read()
        # Strip the RTF markup, leaving plain text
        rtfText = rtf_to_text(text, encoding='utf-8')
        for line in rtfText.splitlines():
            # Keep only non-empty lines
            if line:
                rtfLines.append(fileLine(filename, line))
    return rtfLines

# Read data (kept under a name that does not rebind the builtin `list`)
rtf_lines = getRTFLines()

# Convert data to csv file: one row per text line, file name in its own column
data = [[x.fileName, x.textLine] for x in rtf_lines]
df = pd.DataFrame(data, columns=['File Name', 'Text Line'])
# escapechar is left at its default: an empty string is rejected by the csv
# module on Python 3.11+, and on older versions it behaved like "no escape
# character" anyway.
df.to_csv(os.path.join(output_dir_path, r'rtfData-3.csv'))

Collectives™ on Stack Overflow

Python script to combine RTF files in a folder to CSV with filename as a separate column

1 Answer 1

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Related