2025-05-12 실전프로젝트 2 - 학습일 별로 데이터 분류

seyeon1130 2025. 5. 12. 21:07

최종 데이터 전처리

# Drop columns that are not used in the analysis.
df.drop(['roles', 'incomplete_flag', 'nplay_video'], axis=1, inplace=True)

# Drop rows that are missing any of the demographic fields.
df.dropna(subset=['LoE_DI', 'YoB', 'gender'], inplace=True)

# Drop rows that report two or more active days but have no last-event date.
# .copy() detaches the filtered frame so the column assignments below act on
# an independent DataFrame instead of a slice of the original.
df = df[~((df['ndays_act'].notnull()) & (df['ndays_act'] >= 2) & (df['last_event_DI'].isnull()))].copy()

# Fill the remaining missing last_event_DI values with start_time_DI.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: that chained in-place form warns in recent pandas and does
# not update df when Copy-on-Write is enabled.
df['last_event_DI'] = df['last_event_DI'].fillna(df['start_time_DI'])

# Replace every other missing value with 0.
df.fillna(0, inplace=True)

# Convert the date columns to datetime; values that cannot be parsed become NaT.
date_columns = ['start_time_DI', 'last_event_DI']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Drop rows whose last event is earlier than the start date.
df = df[~(df['last_event_DI'] < df['start_time_DI'])].copy()

# Convert grade to a number; values that fail to parse become NaN and are
# then set to 0 (assigned back for the same reason as last_event_DI above).
df['grade'] = pd.to_numeric(df['grade'], errors='coerce')
df['grade'] = df['grade'].fillna(0)

# Derive age as (start year - year of birth) and keep only rows with
# 10 < age < 100 (both bounds are exclusive).
df['age'] = df['start_time_DI'].dt.year - df['YoB']
df = df[(df['age'] > 10) & (df['age'] < 100)]

# Keep only rows whose grade does not exceed 1.0.
df = df[df['grade'] <= 1.0]

# Drop rows that are certified despite having zero active days.
# .copy() so that later column assignments operate on an independent frame.
df = df[~((df['ndays_act'] == 0) & (df['certified'] == 1))].copy()

전에꺼에 추가로

나이를 10세 초과 100세 미만으로 추가로 정리함
학습일이 0일인데 수료한 경우 11건은 제거함
last_event_DI가 결측일 경우, 학습일이 0 혹은 1이기 때문에 start_time_DI로 대체함.
last_event_DI가 start_time보다 이전일 경우 제거
grade(점수)가 1.0을 초과하는 경우 제거

파생변수 생성

나이대 컬럼

# Bucket age into decade groups. pd.cut uses right-closed intervals by
# default, so the bins are (10, 19], (19, 29], ..., (89, 99].
# The final (89, 99] bin covers ages 90-99: the preprocessing age filter keeps
# every age below 100, and without this bin those rows would be left with a
# missing age_group.
df['age_group'] = pd.cut(df['age'],
                         bins=[10, 19, 29, 39, 49, 59, 69, 79, 89, 99],
                         labels=['10대', '20대', '30대', '40대', '50대', '60대', '70대', '80대', '90대'])

강의 명 컬럼

# Split course_id on '/' into its three components:
# university / course code / term.
course_id_parts = df['course_id'].str.split('/', expand=True)
df[['university', 'course_name', 'term']] = course_id_parts

# Lookup table from course code to the full course title.
course_name_dict = {
    'CS50x': 'Introduction to computer science',
    'PH207x': 'Quantitative Methods in Clinical & Public Health Research',
    'PH278x': 'Human Health and Global Environmental Change',
    'CB22x': 'The Ancient Greek Hero',  # literature
    'ER22x': 'Michael Sandel, Justice',
    '6.00x': 'Introduction to Computer Science and Programming Using Python',
    '6.002x': 'Circuits and Electronics',
    '8.02x': 'Electricity and Magnetism',  # introductory physics
    '14.73x': 'The Challenges of Global Poverty',
    '7.00x': 'Introduction to Biology',
    '3.091x': 'Introduction to Solid State Chemistry',
    '8.MReV': 'Mechanics ReView',  # physics
    '2.01x': 'Elements of Structures',  # introductory structural mechanics
}

# Course codes that are not in the table map to NaN.
course_titles = df['course_name'].map(course_name_dict)
df['course_title'] = course_titles

학습 일자별 컬럼

def categorize_ndays_act(days):
    """Return the activity-category label for a number of active days.

    Buckets (upper bounds are inclusive):

        days == 0         -> "비학습"
        days <= 3         -> "3일 이내 학습"  (any non-zero value up to 3)
        3 < days <= 7     -> "3일 이상 학습"
        7 < days <= 30    -> "7일 이상 학습"
        30 < days <= 90   -> "30일 이상 학습"
        days > 90         -> "90일 이상 학습"

    Args:
        days: Number of active days (int or float).

    Returns:
        The label string for the bucket, or None when ``days`` is NaN so
        that a missing value is not reported as a real category.
    """
    # NaN is the only value that is not equal to itself. It fails every
    # comparison below, so without this guard it would fall through to the
    # final "90일 이상 학습" bucket.
    if days != days:
        return None

    if days == 0:
        return "비학습"

    # (inclusive upper bound, label) pairs, checked in ascending order.
    upper_bounds = (
        (3, "3일 이내 학습"),
        (7, "3일 이상 학습"),
        (30, "7일 이상 학습"),
        (90, "30일 이상 학습"),
    )
    for upper_bound, label in upper_bounds:
        if days <= upper_bound:
            return label
    return "90일 이상 학습"

# Category labels listed from least to most active; this list fixes the
# order of the ordered categorical built below.
activity_order = [
    "비학습",
    "3일 이내 학습",
    "3일 이상 학습",
    "7일 이상 학습",
    "30일 이상 학습",
    "90일 이상 학습",
]

# Label every row from its ndays_act value, then store the labels as an
# ordered categorical so they sort and plot in activity_order.
activity_labels = df['ndays_act'].apply(categorize_ndays_act)
df['activity_category'] = pd.Categorical(
    activity_labels,
    categories=activity_order,
    ordered=True,
)

학습 상태 컬럼

# Row masks used to derive the learner status.
never_started = df['activity_category'] == '비학습'
not_certified = df['certified'] == 0
is_certified = df['certified'] == 1

# Rows that match none of the rules below keep the empty-string status.
df['status'] = ''

# Enrolled but falls in the no-activity category.
df.loc[never_started, 'status'] = 'Non-Starter'

# Has activity but did not earn a certificate.
df.loc[~never_started & not_certified, 'status'] = 'Non-Completer'

# Has activity and earned a certificate.
df.loc[~never_started & is_certified, 'status'] = 'Completer'

이후 다른 강의 들었는지 여부

# Flag whether each enrollment is followed by another one for the same user.
# First order the rows by user and then by start date.
df = df.sort_values(['userid_DI', 'start_time_DI'])

# Within each user, look one row ahead; next_course is True when that
# following row exists with a non-missing course_id.
following_course = df.groupby('userid_DI')['course_id'].shift(-1)
df['next_course'] = following_course.notna()

내가 맡은 부분은 학습 일자별 학생 분류

위처럼 학습 일자별 컬럼을 만든 이후 분석 진행

분류 별 카운트

# Bar chart of the number of rows in each activity category.
plt.figure(figsize=(12, 5))
ax = sns.countplot(x='activity_category', data=df)

# Write each bar's count just above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_height)}',
                (bar_center, bar_height),
                ha='center', va='baseline', fontsize=12)

ax.set_title('학습일 분류 카운트')
plt.show()

분류별 수료 수

# Sum of the certified column per activity category.
# observed=False is passed explicitly: grouping on a categorical column
# without it relies on a deprecated default, and the explicit value keeps
# every category in the result even when it has no rows.
c = df.groupby('activity_category', observed=False)['certified'].sum().reset_index()

plt.figure(figsize=(12, 5))
ax = sns.barplot(x='activity_category', y='certified', data=c)

# Write each bar's value just above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{int(bar_height)}',
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='baseline', fontsize=12)

ax.set_title('학습일 별 수료 카운트')
plt.show()

수료는 30일 이상 학습자가 가장 많다.

분류별 수료율

# Certification rate per activity category, as a percentage
# (mean of the certified column times 100).
# observed=False is passed explicitly: grouping on a categorical column
# without it relies on a deprecated default, and the explicit value keeps
# every category in the result even when it has no rows.
certification_rate = df.groupby('activity_category', observed=False)['certified'].mean().reset_index()
certification_rate['certified'] *= 100

plt.figure(figsize=(12, 6))
ax = sns.barplot(x='activity_category', y='certified', data=certification_rate)

# Write each bar's percentage just above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.2f}%',
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='baseline', fontsize=12)

ax.set_title('카테고리별 수료율 (%)')
plt.show()

수료율은 90일 이상 학습자가 가장 높다. 학습일이 길어질수록 수료율도 높아지는 편

강의 콘텐츠를 본 비율은?

# Share of rows with the viewed flag set, per activity category, as a
# percentage (mean of the viewed column times 100).
# observed=False is passed explicitly: grouping on a categorical column
# without it relies on a deprecated default, and the explicit value keeps
# every category in the result even when it has no rows.
view_rate = df.groupby('activity_category', observed=False)['viewed'].mean().reset_index()
view_rate['viewed'] *= 100

plt.figure(figsize=(12, 6))
ax = sns.barplot(x='activity_category', y='viewed', data=view_rate)

# Write each bar's percentage just above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.2f}%',
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='baseline', fontsize=12)

ax.set_title('강의 콘텐츠를 본 비율 (%)')
plt.show()

비학습자도 꽤나 많이 본 것을 보아, 강의 콘텐츠는 강의자료인듯함.

초반만 슥 훑어보고 안본듯

강의 콘텐츠를 적극적으로 본 비율은?

# Share of rows with the explored flag set, per activity category, as a
# percentage (mean of the explored column times 100).
# observed=False is passed explicitly: grouping on a categorical column
# without it relies on a deprecated default, and the explicit value keeps
# every category in the result even when it has no rows.
explored_rate = df.groupby('activity_category', observed=False)['explored'].mean().reset_index()
explored_rate['explored'] *= 100

plt.figure(figsize=(12, 6))
ax = sns.barplot(x='activity_category', y='explored', data=explored_rate)

# Write each bar's percentage just above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.2f}%',
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='baseline', fontsize=12)

ax.set_title('강의 콘텐츠를 적극적으로 본 비율 (%)')
plt.show()

적극적으로 봤나 싶었더니 역시나 적극적으로 본 건 오래 공부한 사람들

다음 강의도 본 분류는?

# Order each user's enrollments by start date.
df = df.sort_values(by=['userid_DI', 'start_time_DI'])

# Within each user, look one row ahead; next_course is True when that
# following row exists with a non-missing course_id.
df['next_course'] = df.groupby('userid_DI')['course_id'].shift(-1).notna()

# Share of rows followed by another enrollment, per activity category, as a
# percentage (mean of the boolean next_course column times 100).
# observed=False is passed explicitly: grouping on a categorical column
# without it relies on a deprecated default, and the explicit value keeps
# every category in the result even when it has no rows.
activity_next_course = df.groupby('activity_category', observed=False)['next_course'].mean().reset_index()
activity_next_course['next_course'] *= 100

# Plot the rate per category.
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='activity_category', y='next_course', data=activity_next_course)

# Write each bar's percentage just above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.2f}%',
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='baseline', fontsize=12)

ax.set_title('액티비티 카테고리별 다음 강의 수강 비율 (시간적 순서 기준)')
plt.show()

오늘의 결론: 한 달 이상, 특히 3개월 이상 학습할 경우 수료율도 높고 다음 강의 수강률(재수강률)도 높음. 따라서 학습을 오래 이끌 수 있는 방안 생각하기.

강의 콘텐츠를 자세히 볼 수록 학습이 길게 이어지는 모습이 보이긴 함.