Dropping grouped rows based on a certain hierarchical column

Question

Suppose I have this pandas dataset:

ID	Question Code
1	Q01
1	Q01-1
1	Q02
2	Q01
2	Q02
2	Q02-1
2	Q02-1-1
2	Q02-2

I want to remove rows based on the hierarchical relationship between the question codes within each ID. For example, Q01-1 is a sub-question that is asked when Q01 is answered, so I don't need to keep Q01 anymore since we already have Q01-1. For ID 2, I need to keep Q01, Q02-1-1 (since it is a sub-question of Q02-1, which is itself a sub-question of Q02) and Q02-2 (since it is another sub-question of Q02).

The desired final result of the table above would be:

ID	Question Code
1	Q01-1
1	Q02
2	Q01
2	Q02-1-1
2	Q02-2

Thanks in advance for the help!

mozway · Accepted Answer · 2025-01-27 13:10:58Z

1

You could extract the part before the trailing -xxx and use this to identify the levels to drop with boolean indexing. Perform per group with groupby.transform:

def _has_no_sub_question(codes):
    """Flag the codes of one ID group that are not the parent of another code.

    The parent of a code is everything before its last ``-xxx`` segment;
    codes without a hyphen yield no parent and are dropped from the lookup.
    """
    parents = codes.str.extract('(.*)-[^-]+$', expand=False).dropna()
    return ~codes.isin(parents)


# Evaluate the mask separately for every ID, then keep the flagged rows.
keep = df.groupby('ID')['Question Code'].transform(_has_no_sub_question)
out = df[keep]

Output:

   ID Question Code
1   1         Q01-1
2   1           Q02
3   2           Q01
6   2       Q02-1-1
7   2         Q02-2

Example intermediates for ID 2:

# s.str.extract('(.*)-[^-]+$', expand=False).dropna()
5      Q02
6    Q02-1
7      Q02
Name: Question Code, dtype: object

# ~s.isin(s.str.extract('(.*)-[^-]+$', expand=False).dropna())
3     True
4    False
5    False
6     True
7     True
Name: Question Code, dtype: bool

answered Jan 27 at 13:10

mozway

267k13 gold badges56 silver badges106 bronze badges

Sign up to request clarification or add additional context in comments.

Comments

Soudipta Dutta · Accepted Answer · 2025-04-20 13:52:37Z

import pandas as pd
import warnings


# Sample survey data: one row per (respondent ID, answered question code).
data = dict(
    [
        ('ID', [1, 1, 1, 2, 2, 2, 2, 2]),
        (
            'Question Code',
            ['Q01', 'Q01-1', 'Q02', 'Q01', 'Q02', 'Q02-1', 'Q02-1-1', 'Q02-2'],
        ),
    ]
)

question_df = pd.DataFrame(data=data)
'''
   ID Question Code
0   1           Q01
1   1         Q01-1
2   1           Q02
3   2           Q01
4   2           Q02
5   2         Q02-1
6   2       Q02-1-1
7   2         Q02-2
'''

# Define a function to process each group of questions (grouped by 'ID')
def filter_child_questions(group):
    """Return the rows of ``group`` whose code is not the parent of another code.

    A code's parent is the part before its last hyphen (``'Q02-1-1'`` ->
    ``'Q02-1'``). Only direct parents that actually occur in the group are
    removed, so the result contains the deepest sub-questions plus any
    question that has no sub-question in this group.

    Args:
        group: DataFrame with a ``'Question Code'`` column, normally the rows
            belonging to a single ID.

    Returns:
        The filtered rows of ``group`` with their original index labels.
    """
    codes = group['Question Code']

    # Missing values (None/NaN) are not strings; skipping them avoids a
    # TypeError from the ``'-' in code`` test. Such rows are never parents,
    # so they are kept in the output.
    parent_codes = {
        code.rsplit('-', 1)[0]
        for code in set(codes)
        if isinstance(code, str) and '-' in code
    }

    # Keep everything that is not a parent of some other code in the group.
    return group[~codes.isin(parent_codes)]

# Group the DataFrame by 'ID' and apply 'filter_child_questions' to each group.
# The columns are selected explicitly (grouping column included) so that 'ID'
# is still passed to the function and kept in the result; relying on the
# implicit inclusion of the grouping column is deprecated in pandas, and
# selecting it explicitly also means no warning has to be suppressed.
# group_keys=False prevents the group keys ('ID') from becoming part of the index.
filtered_result = (
    question_df.groupby('ID', group_keys=False)[question_df.columns.tolist()]
    .apply(filter_child_questions)
)

print("\nFiltered Result:")
print(filtered_result)
'''
   ID Question Code
1   1         Q01-1
2   1           Q02
3   2           Q01
6   2       Q02-1-1
7   2         Q02-2
'''

Soudipta Dutta · Accepted Answer · 2025-04-20 16:44:44Z

0

The same approach without defining a separate function:

import pandas as pd
import warnings

data = {
    'ID': [1, 1, 1, 2, 2, 2, 2, 2],
    'Question Code': ['Q01', 'Q01-1', 'Q02', 'Q01', 'Q02', 'Q02-1', 'Q02-1-1', 'Q02-2']
}

df = pd.DataFrame(data)

# For every ID, drop the codes that are the direct parent (text before the
# last hyphen) of another code in the same group.
# The columns are selected explicitly, grouping column included, so 'ID' is
# passed to the lambda and stays in the result; the implicit inclusion of the
# grouping column in groupby.apply is deprecated, and selecting it explicitly
# removes the need to silence that warning.
res = (
    df.groupby('ID', group_keys=False)[df.columns.tolist()]
    .apply(
        lambda gr: gr[~gr['Question Code'].isin(
            {code.rsplit('-', 1)[0] for code in set(gr['Question Code']) if '-' in code})]
    )
    .reset_index(drop=True)
)

print(res)
'''
  ID Question Code
0   1         Q01-1
1   1           Q02
2   2           Q01
3   2       Q02-1-1
4   2         Q02-2
'''

answered Apr 20 at 16:44

Soudipta Dutta

2,0721 gold badge16 silver badges11 bronze badges

Comments

Soudipta Dutta · Accepted Answer · 2025-05-02 09:24:57Z

Polars with Regex :

import polars as pl

df = pl.DataFrame(
    {
        'ID': [1, 1, 1, 2, 2, 2, 2, 2],
        'QuestionCode': ['Q01', 'Q01-1', 'Q02', 'Q01', 'Q02', 'Q02-1', 'Q02-1-1', 'Q02-2'],
    }
)

# BaseCode is the code with its trailing "-segment" removed; a code without
# a hyphen is left as it is (see the table below).
base_code = pl.col('QuestionCode').str.replace(r'-[^-]+$', '').alias('BaseCode')
df = df.with_columns(base_code)
print(df)
'''
┌─────┬──────────────┬──────────┐
│ ID  ┆ QuestionCode ┆ BaseCode │
│ --- ┆ ---          ┆ ---      │
│ i64 ┆ str          ┆ str      │
╞═════╪══════════════╪══════════╡
│ 1   ┆ Q01          ┆ Q01      │
│ 1   ┆ Q01-1        ┆ Q01      │
│ 1   ┆ Q02          ┆ Q02      │
│ 2   ┆ Q01          ┆ Q01      │
│ 2   ┆ Q02          ┆ Q02      │
│ 2   ┆ Q02-1        ┆ Q02      │
│ 2   ┆ Q02-1-1      ┆ Q02-1    │
│ 2   ┆ Q02-2        ┆ Q02      │
└─────┴──────────────┴──────────┘
'''
# Rows whose BaseCode differs from their own code are sub-questions. The
# anti join then removes every (ID, QuestionCode) that matches the
# (ID, BaseCode) of one of those sub-questions, i.e. every parent.
sub_questions = df.filter(pl.col('BaseCode') != pl.col('QuestionCode')).unique()
df = df.join(
    sub_questions,
    left_on=['ID', 'QuestionCode'],
    right_on=['ID', 'BaseCode'],
    how='anti',
)
'''
shape: (5, 3)
┌─────┬──────────────┬──────────┐
│ ID  ┆ QuestionCode ┆ BaseCode │
│ --- ┆ ---          ┆ ---      │
│ i64 ┆ str          ┆ str      │
╞═════╪══════════════╪══════════╡
│ 1   ┆ Q01-1        ┆ Q01      │
│ 1   ┆ Q02          ┆ Q02      │
│ 2   ┆ Q01          ┆ Q01      │
│ 2   ┆ Q02-1-1      ┆ Q02-1    │
│ 2   ┆ Q02-2        ┆ Q02      │
└─────┴──────────────┴──────────┘
'''

# The helper column is no longer needed once the parents are gone.
df = df.drop('BaseCode')
print(df)
'''
shape: (5, 2)
┌─────┬──────────────┐
│ ID  ┆ QuestionCode │
│ --- ┆ ---          │
│ i64 ┆ str          │
╞═════╪══════════════╡
│ 1   ┆ Q01-1        │
│ 1   ┆ Q02          │
│ 2   ┆ Q01          │
│ 2   ┆ Q02-1-1      │
│ 2   ┆ Q02-2        │
└─────┴──────────────┘
'''

Soudipta Dutta · Accepted Answer · 2025-05-13 14:55:20Z

Here, we're using a clever data structure called marisa_trie (pip install marisa-trie). Think of it like a super-efficient way to store and quickly check for prefixes in a list of words or codes.

import numpy as np
import pandas as pd
import marisa_trie

data = {
    'id': [1, 1, 1, 2, 2, 2, 2, 2],
    'code': ['Q01', 'Q01-1', 'Q02', 'Q01', 'Q02', 'Q02-1', 'Q02-1-1', 'Q02-2'],
}

# Order the rows by id and then by code, and renumber the index from 0.
df = (
    pd.DataFrame(data)
    .sort_values(by=['id', 'code'])
    .reset_index(drop=True)
)
'''
   id     code
0   1      Q01
1   1    Q01-1
2   1      Q02
3   2      Q01
4   2      Q02
5   2    Q02-1
6   2  Q02-1-1
7   2    Q02-2
'''
ids = df['id'].to_numpy()
codes = df['code'].to_numpy()

# Start by keeping every row; rows that turn out to have a descendant are
# switched off group by group.
keep_mask = np.ones(len(df), dtype=bool)

for uid in np.unique(ids):
    rows = np.flatnonzero(ids == uid)
    group_codes = codes[rows].tolist()

    # One trie per id, so prefixes are only matched within the same group.
    trie = marisa_trie.Trie(group_codes)

    # A code has a descendant when some key in the trie starts with
    # "<code>-"; those rows are the ones to drop.
    keep_mask[rows] = np.array(
        [not any(trie.keys(code + '-')) for code in group_codes]
    )

res = df[keep_mask]
print(res)
'''
   id     code
1   1    Q01-1
2   1      Q02
3   2      Q01
6   2  Q02-1-1
7   2    Q02-2
'''

Collectives™ on Stack Overflow

Dropping grouped rows based on a certain hierarchical column

5 Answers 5

Comments

Comments

Comments

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

5 Answers 5

Comments

Comments

Comments

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related