# File Name : ds0101_chkMissingValues.py
# Check missing values

import pandas as pd

# Load the raw solar generation workbook into a DataFrame.
df = pd.read_excel(
    './data/solar_energy_generation.xlsx'
)

# Optional quick looks at the data (left disabled):
# print(df)
# print(df.describe())

# Count the missing cells in every column, then report the totals.
missing_per_column = df.isna().sum()
print(missing_per_column)








# File Name : ds0103_removeMissingValue.py
# Save the data to an Excel file after removing missing values

import pandas as pd

# Load the raw solar generation workbook.
df = pd.read_excel(
    './data/solar_energy_generation.xlsx'
)

# Report how many cells are missing in each column before cleaning.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Keep only the rows that have a value in every column.
df_clean = df.dropna(axis=0, how='any')

# Write the cleaned rows to a new workbook, without the index column.
df_clean.to_excel(
    './data/removeMissingValue.xlsx',
    index=False,
)

# Show the row counts before and after cleaning.
original_rows = df.shape[0]
cleaned_rows = df_clean.shape[0]
print('Original data size:', original_rows)
print('Cleaned data size:', cleaned_rows)







# File Name : ds0105_chkMissingValues.py
# Re-check missing values in the cleaned file (removeMissingValue.xlsx)

import pandas as pd

# Load the workbook that the missing-value removal step wrote out.
df = pd.read_excel(
    './data/removeMissingValue.xlsx'
)

# Optional quick looks at the data (left disabled):
# print(df)
# print(df.describe())

# Per-column count of missing cells in the loaded data.
remaining_missing = df.isna().sum()
print(remaining_missing)






# File Name : ds0109_chkOutlier_02.py

import pandas as pd
import matplotlib.pyplot as plt

# Read the workbook written by the missing-value removal step
df = pd.read_excel('./data/removeMissingValue.xlsx')

# Column headers used below.
# The previous literals were mojibake (CP949/EUC-KR bytes decoded as Latin-1);
# they are restored here to the Korean text those bytes encode.
# NOTE(review): assumes the sheet's headers are exactly these Korean strings -
# confirm against the workbook.
HUMIDITY_COL = '습도'  # "humidity"
SOLAR_POWER_COL = '태양광 발전량(kW)'  # "solar power generation (kW)"

# A row is treated as an outlier when humidity + solar power exceeds this sum
OUTLIER_SUM_THRESHOLD = 150

# Select outliers (the sum is computed once and reused as the row filter)
humidity_plus_power = df[HUMIDITY_COL] + df[SOLAR_POWER_COL]
outlier = df[humidity_plus_power > OUTLIER_SUM_THRESHOLD]

# Create figure
plt.figure(figsize=(10, 7))

# Draw scatter plot of all rows, colored by solar power
scatter = plt.scatter(
    df[HUMIDITY_COL],
    df[SOLAR_POWER_COL],
    c=df[SOLAR_POWER_COL],  # Color criterion
    cmap='viridis',         # Blue → Green → Yellow
    alpha=0.6,
    s=20
)

# Display color bar
plt.colorbar(
    scatter,
    label='Solar Power (kW)'
)

# Highlight outliers in red, drawn larger so they stand out from the
# points already plotted above
plt.scatter(
    outlier[HUMIDITY_COL],
    outlier[SOLAR_POWER_COL],
    color='red',
    s=120,
    edgecolors='black',
    label='Outliers'
)

# Set axis labels and graph title
plt.xlabel('Humidity')
plt.ylabel('Solar Power (kW)')
plt.title('Outlier Detection')

# Display grid
plt.grid(True, alpha=0.3)

# Display legend (only the outlier series carries a label)
plt.legend()

# Display graph
plt.show()








# File Name : ds0113_removeOutlier_03.py

import pandas as pd
import matplotlib.pyplot as plt

# Read the workbook written by the missing-value removal step
df = pd.read_excel('./data/removeMissingValue.xlsx')

# Column headers used below.
# The previous literals were mojibake (CP949/EUC-KR bytes decoded as Latin-1);
# they are restored here to the Korean text those bytes encode.
# NOTE(review): assumes the sheet's headers are exactly these Korean strings -
# confirm against the workbook.
HUMIDITY_COL = '습도'  # "humidity"
SOLAR_POWER_COL = '태양광 발전량(kW)'  # "solar power generation (kW)"

# A row is treated as an outlier when humidity + solar power exceeds this sum
OUTLIER_SUM_THRESHOLD = 150

# Print original data size
print('Original data size:', len(df))

# Compute the humidity + power sum once; both filters below reuse it so the
# outlier set and the cleaned set are always split on the same values
humidity_plus_power = df[HUMIDITY_COL] + df[SOLAR_POWER_COL]

# Select outliers
outlier = df[humidity_plus_power > OUTLIER_SUM_THRESHOLD]

# Print outlier size
print('Outlier size:', len(outlier))

# Remove outliers (keep rows whose sum is at or below the threshold)
df_clean = df[humidity_plus_power <= OUTLIER_SUM_THRESHOLD]

# Print cleaned data size
print('Cleaned data size:', len(df_clean))

# Save cleaned data to an Excel file, without the index column
df_clean.to_excel(
    './data/solar_energy_generation_no_outlier.xlsx',
    index=False
)

print('Outlier removal completed.')

# ----------------------------------
# Draw scatter plot after outlier removal
# ----------------------------------

# Create figure
plt.figure(figsize=(10, 7))

# Draw scatter plot of the remaining rows, colored by solar power
scatter = plt.scatter(
    df_clean[HUMIDITY_COL],
    df_clean[SOLAR_POWER_COL],
    c=df_clean[SOLAR_POWER_COL],  # Color criterion
    cmap='viridis',
    alpha=0.6,
    s=30
)

# Display color bar
plt.colorbar(
    scatter,
    label='Solar Power (kW)'
)

# Set axis labels and graph title
plt.xlabel('Humidity')
plt.ylabel('Solar Power (kW)')
plt.title('Scatter Plot After Outlier Removal')

# Display grid
plt.grid(True, alpha=0.3)

# Display graph
plt.show()











