EDA তে ব্যবহৃত সাধারণ গ্রাফিক্যাল টেকনিকসমূহ — ডেটা বিশ্লেষণে গুরুত্বপূর্ণ ভিজ্যুয়াল সরঞ্জাম

In this article, we'll take a look at the common graphical techniques used in EDA.

এক্সপ্লোরেটরি ডেটা অ্যানালাইসিস (EDA) গুরুত্বপূর্ণ কারণ এটি ডেটা বিজ্ঞানীদের ডেটাসেটের একটি গভীর উপলব্ধি লাভ করতে সাহায্য করে। এখানে ডেটা বিশ্লেষণের জন্য ব্যবহৃত কিছু সাধারণ গ্রাফিকাল এবং পরিমাণগত কৌশল সম্পর্কে আলোচনা করা হলো, সাথে পাইথন ব্যবহার করে কিছু উদাহরণের ধারণা দেওয়া হলো:

গ্রাফিক্যাল কৌশল (Graphical Techniques):

গ্রাফিকাল কৌশলগুলো ডেটার বিতরণ, প্যাটার্ন এবং সম্পর্ক visualise করতে সাহায্য করে।

১. বক্স প্লট (Box Plot):

বক্স প্লট ডেটার কেন্দ্র, বিস্তার এবং আউটলায়ার (outliers) দেখতে সাহায্য করে। এটি ডেটার কোয়ার্টাইল (quartiles), মধ্যমা (median) এবং রেঞ্জ (range) দেখায়।


import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Sample data: 100 draws from a standard normal distribution, with two
# extreme values appended so the plot shows points beyond the whiskers.
data = np.concatenate([
    np.random.normal(loc=0, scale=1, size=100),
    [5, -4],
])

# Create the figure and axes explicitly and draw on that axes object.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(y=data, ax=ax)
ax.set_title('Box Plot Example')
ax.set_ylabel('Value')
plt.show()

import seaborn as sns

import matplotlib.pyplot as plt

import numpy as np

# Sample data: 100 draws from a standard normal distribution, with two
# extreme values appended so the plot shows points beyond the whiskers.
data = np.concatenate([
    np.random.normal(loc=0, scale=1, size=100),
    [5, -4],
])

# Create the figure and axes explicitly and draw on that axes object.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(y=data, ax=ax)
ax.set_title('Box Plot Example')
ax.set_ylabel('Value')
plt.show()

২. হিস্টোগ্রাম (Histogram):

হিস্টোগ্রাম ডেটার ফ্রিকোয়েন্সি বিতরণ (frequency distribution) দেখায়। এটি ডেটার রেঞ্জের মধ্যে বিভিন্ন বিন (bin) তৈরি করে এবং প্রতিটি বিনে কতগুলো ডেটা পয়েন্ট আছে তা গণনা করে একটি বার চার্ট আকারে প্রদর্শন করে।


import matplotlib.pyplot as plt
import numpy as np

# Sample data: 1,000 draws from a standard normal distribution.
data = np.random.normal(loc=0, scale=1, size=1000)

fig, ax = plt.subplots(figsize=(8, 6))
# 30 bins; black edges and partial transparency keep adjacent bars distinct.
ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
ax.set(title='Histogram Example', xlabel='Value', ylabel='Frequency')
plt.show()

import matplotlib.pyplot as plt

import numpy as np

# Sample data: 1,000 draws from a standard normal distribution.
data = np.random.normal(loc=0, scale=1, size=1000)

fig, ax = plt.subplots(figsize=(8, 6))
# 30 bins; black edges and partial transparency keep adjacent bars distinct.
ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
ax.set(title='Histogram Example', xlabel='Value', ylabel='Frequency')
plt.show()

৩. মাল্টি-ভেরিয়েট চার্ট (Multi-Vari Chart):

মাল্টি-ভেরিয়েট চার্ট একটি ভেরিয়েবলের বিভিন্ন উপাদানের (যেমন, ব্যাচ, মেশিন, অপারেটর) মধ্যে ভিন্নতা (variation) তুলনা করতে ব্যবহৃত হয়। এটি তিনটি প্রধান উৎস থেকে ভিন্নতা সনাক্ত করতে সাহায্য করে: উইদিন-গ্রুপ (within-group), বিটুইন-গ্রুপ (between-group), এবং টাইম-টু-টাইম (time-to-time)। পাইথনে সরাসরি Multi-Vari Chart এর জন্য ডেডিকেটেড লাইব্রেরি কম থাকলেও, matplotlib এবং seaborn ব্যবহার করে এটি তৈরি করা যায়, বিশেষ করে যখন ডেটা ফ্রেমকে সঠিক উপায়ে সাজানো হয়।
উদাহরণস্বরূপ, বিভিন্ন শিফটে একটি পণ্যের দৈর্ঘ্যের ভিন্নতা দেখতে:


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Sample data: 3 shifts x 3 batches x 10 parts = 90 simulated lengths.
data = {
    'Shift': np.repeat(['Morning', 'Evening', 'Night'], 30),
    'Batch': np.tile(np.repeat([1, 2, 3], 10), 3),
    'Length': np.random.normal(loc=100, scale=2, size=90),
}
df = pd.DataFrame(data)

# Add a shift-specific offset to every measurement so the shifts differ.
shift_offsets = {'Morning': 1, 'Evening': -0.5, 'Night': 0.2}
df['Length'] = df['Length'] + df['Shift'].map(shift_offsets)

# One box per (shift, batch) pair: shifts on the x-axis, batches as hue.
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=df, x='Shift', y='Length', hue='Batch', palette='viridis', ax=ax)
ax.set_title('Multi-Vari Chart: Product Length by Shift and Batch')
ax.set_xlabel('Shift')
ax.set_ylabel('Product Length')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import numpy as np

# Sample data: 3 shifts x 3 batches x 10 parts = 90 simulated lengths.
data = {
    'Shift': np.repeat(['Morning', 'Evening', 'Night'], 30),
    'Batch': np.tile(np.repeat([1, 2, 3], 10), 3),
    'Length': np.random.normal(loc=100, scale=2, size=90),
}
df = pd.DataFrame(data)

# Add a shift-specific offset to every measurement so the shifts differ.
shift_offsets = {'Morning': 1, 'Evening': -0.5, 'Night': 0.2}
df['Length'] = df['Length'] + df['Shift'].map(shift_offsets)

# One box per (shift, batch) pair: shifts on the x-axis, batches as hue.
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=df, x='Shift', y='Length', hue='Batch', palette='viridis', ax=ax)
ax.set_title('Multi-Vari Chart: Product Length by Shift and Batch')
ax.set_xlabel('Shift')
ax.set_ylabel('Product Length')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

৪. রান চার্ট (Run Chart):

রান চার্ট সময়ের সাথে ডেটা পয়েন্টগুলি প্লট করে এবং একটি কেন্দ্রীয় রেখা (সাধারণত গড় বা মধ্যমা) অন্তর্ভুক্ত করে। এটি সময়ের সাথে ডেটার প্যাটার্ন, প্রবণতা (trends) বা চক্র (cycles) সনাক্ত করতে সহায়ক। এটি কন্ট্রোল চার্টের একটি সহজ রূপ, যেখানে কন্ট্রোল লিমিট (control limits) থাকে না।


import matplotlib.pyplot as plt
import numpy as np

# Sample data representing a process over time: the cumulative sum of 50
# normal steps (mean 0.1, std 0.5) offset by 10, i.e. a slight upward trend.
time_points = np.arange(1, 51)
process_data = np.cumsum(np.random.normal(loc=0.1, scale=0.5, size=50)) + 10

# The center line of this run chart is the mean of the observed values.
center_line = np.mean(process_data)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(time_points, process_data, marker='o', linestyle='-', color='blue', markersize=4)
ax.axhline(y=center_line, color='red', linestyle='--', label=f'Mean: {center_line:.2f}')
ax.set(
    title='Run Chart: Process Performance Over Time',
    xlabel='Time Point',
    ylabel='Measurement',
)
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend()
plt.show()

import matplotlib.pyplot as plt

import numpy as np

# Sample data representing a process over time: the cumulative sum of 50
# normal steps (mean 0.1, std 0.5) offset by 10, i.e. a slight upward trend.
time_points = np.arange(1, 51)
process_data = np.cumsum(np.random.normal(loc=0.1, scale=0.5, size=50)) + 10

# The center line of this run chart is the mean of the observed values.
center_line = np.mean(process_data)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(time_points, process_data, marker='o', linestyle='-', color='blue', markersize=4)
ax.axhline(y=center_line, color='red', linestyle='--', label=f'Mean: {center_line:.2f}')
ax.set(
    title='Run Chart: Process Performance Over Time',
    xlabel='Time Point',
    ylabel='Measurement',
)
ax.grid(True, linestyle='--', alpha=0.6)
ax.legend()
plt.show()

৫. প্যারেটো চার্ট (Pareto Chart):

প্যারেটো চার্ট একটি বার চার্ট এবং একটি লাইন গ্রাফের সংমিশ্রণ। এটি বিভিন্ন ক্যাটাগরির ফ্রিকোয়েন্সি (বার) দেখায়, যা সর্বোচ্চ থেকে সর্বনিম্ন ফ্রিকোয়েন্সি অনুসারে সাজানো থাকে, এবং এর সাথে একটি ক্রমবর্ধমান শতাংশ (লাইন) প্লট করে। এটি “80/20 নিয়ম” (Pareto Principle) অনুসরণ করে, যা বলে যে বেশিরভাগ সমস্যা অল্প সংখ্যক কারণের কারণে ঘটে। এটি প্রায়শই সমস্যা সমাধানে অগ্রাধিকার দিতে ব্যবহৃত হয়।


import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import PercentFormatter

# Sample data: number of occurrences for each type of defect.
defects = {
    'Defect Type': ['Missing Part', 'Scratches', 'Wrong Color', 'Cracked', 'Dented', 'Other'],
    'Count': [85, 60, 45, 20, 10, 5],
}
# A Pareto chart lists categories from most to least frequent.
df_defects = pd.DataFrame(defects).sort_values(by='Count', ascending=False)

# Running share (0-100) of all defects covered by the categories so far.
df_defects['Cumulative_Percentage'] = (
    df_defects['Count'].cumsum() / df_defects['Count'].sum() * 100
)

fig, ax1 = plt.subplots(figsize=(10, 6))

# Left y-axis: raw counts drawn as bars.
ax1.bar(df_defects['Defect Type'], df_defects['Count'], color='skyblue')
ax1.set_xlabel('Defect Type')
ax1.set_ylabel('Number of Occurrences', color='skyblue')
ax1.tick_params(axis='y', labelcolor='skyblue')

# Right y-axis: cumulative percentage drawn as a line over the same x-axis.
ax2 = ax1.twinx()
ax2.plot(df_defects['Defect Type'], df_defects['Cumulative_Percentage'], color='red', marker='o', linestyle='-')
ax2.set_ylabel('Cumulative Percentage (%)', color='red')
ax2.tick_params(axis='y', labelcolor='red')
# Anchor the percentage axis at 0 so the cumulative line is not exaggerated
# by auto-scaling; the small headroom keeps the 100% marker fully visible.
ax2.set_ylim(0, 105)
# Use the ticker module's formatter directly instead of relying on the
# formatter classes that pyplot happens to re-export.
ax2.yaxis.set_major_formatter(PercentFormatter(xmax=100, decimals=0))

plt.title('Pareto Chart of Product Defects')
fig.tight_layout()
plt.show()

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import PercentFormatter

# Sample data: number of occurrences for each type of defect.
defects = {
    'Defect Type': ['Missing Part', 'Scratches', 'Wrong Color', 'Cracked', 'Dented', 'Other'],
    'Count': [85, 60, 45, 20, 10, 5],
}
# A Pareto chart lists categories from most to least frequent.
df_defects = pd.DataFrame(defects).sort_values(by='Count', ascending=False)

# Running share (0-100) of all defects covered by the categories so far.
df_defects['Cumulative_Percentage'] = (
    df_defects['Count'].cumsum() / df_defects['Count'].sum() * 100
)

fig, ax1 = plt.subplots(figsize=(10, 6))

# Left y-axis: raw counts drawn as bars.
ax1.bar(df_defects['Defect Type'], df_defects['Count'], color='skyblue')
ax1.set_xlabel('Defect Type')
ax1.set_ylabel('Number of Occurrences', color='skyblue')
ax1.tick_params(axis='y', labelcolor='skyblue')

# Right y-axis: cumulative percentage drawn as a line over the same x-axis.
ax2 = ax1.twinx()
ax2.plot(df_defects['Defect Type'], df_defects['Cumulative_Percentage'], color='red', marker='o', linestyle='-')
ax2.set_ylabel('Cumulative Percentage (%)', color='red')
ax2.tick_params(axis='y', labelcolor='red')
# Anchor the percentage axis at 0 so the cumulative line is not exaggerated
# by auto-scaling; the small headroom keeps the 100% marker fully visible.
ax2.set_ylim(0, 105)
# Use the ticker module's formatter directly instead of relying on the
# formatter classes that pyplot happens to re-export.
ax2.yaxis.set_major_formatter(PercentFormatter(xmax=100, decimals=0))

plt.title('Pareto Chart of Product Defects')
fig.tight_layout()
plt.show()

৬. স্ক্যাটার প্লট (Scatter Plot – 2D/3D):

স্ক্যাটার প্লট দুটি (2D) বা তিনটি (3D) ভেরিয়েবলের মধ্যে সম্পর্ক দেখতে সাহায্য করে। প্রতিটি ডেটা পয়েন্টকে একটি বিন্দু হিসাবে প্লট করা হয়।


import matplotlib.pyplot as plt
import numpy as np

# Sample data: 100 uniform x-values in [0, 1); y follows the line 2x + 1
# with Gaussian noise scaled by 0.1.
x = np.random.rand(100)
noise = np.random.randn(100) * 0.1
y = 2 * x + 1 + noise

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y)
ax.set(title='2D Scatter Plot Example', xlabel='X-axis', ylabel='Y-axis')
plt.show()

import matplotlib.pyplot as plt

import numpy as np

# Sample data: 100 uniform x-values in [0, 1); y follows the line 2x + 1
# with Gaussian noise scaled by 0.1.
x = np.random.rand(100)
noise = np.random.randn(100) * 0.1
y = 2 * x + 1 + noise

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, y)
ax.set(title='2D Scatter Plot Example', xlabel='X-axis', ylabel='Y-axis')
plt.show()

৭. স্টেম-অ্যান্ড-লিফ প্লট (Stem-and-Leaf Plot):

স্টেম-অ্যান্ড-লিফ প্লট একটি ডেটাসেটের প্রতিটি ডেটা পয়েন্টকে “স্টেম” (প্রথম ডিজিট বা ডিজিটগুলো) এবং “লিফ” (শেষ ডিজিট) এ বিভক্ত করে ডেটার আকার এবং বন্টন দেখায়। এটি হিস্টোগ্রামের মতো কিন্তু প্রতিটি ডেটা পয়েন্ট সংরক্ষণ করে। পাইথনে সরাসরি বিল্ট-ইন ফাংশন না থাকলেও, কাস্টম ফাংশন দিয়ে তৈরি করা যায়।


def stem_and_leaf_plot(data):
    """Print a text stem-and-leaf display of ``data``.

    Each value is split at the tens place: the stem is ``value // 10`` and
    the leaf is ``value % 10`` (both truncated to ``int``). Stems are printed
    in ascending order, one per line, followed by their leaves in ascending
    order of the original values.

    data: iterable of numbers (integers are assumed for a meaningful plot).
    """
    leaves_by_stem = {}
    # Walking the values in sorted order keeps each stem's leaves sorted too.
    for value in sorted(data):
        tens, units = divmod(value, 10)
        leaves_by_stem.setdefault(int(tens), []).append(str(int(units)))

    print("Stem-and-Leaf Plot:")
    for tens in sorted(leaves_by_stem):
        print(f"{tens} | {' '.join(leaves_by_stem[tens])}")

# Sample data: two-digit integers, so each stem is a tens digit and each
# leaf is a units digit.
data = [12, 13, 21, 24, 25, 27, 30, 31, 32, 35, 41, 42, 45, 47, 50, 52, 53, 55, 60, 61, 62]
stem_and_leaf_plot(data)

# Example with a different range: three-digit values.
data_large = [101, 105, 112, 118, 120, 123, 127, 131, 135, 140]

def stem_and_leaf_plot_flexible(data):
    """
    Print a stem-and-leaf plot whose stem unit adapts to the data's magnitude.

    The largest value decides how each number is split:
      * largest value >= 1000: stem = value // 100, leaf = the last two
        digits (zero-padded), which keeps the number of stems manageable;
      * otherwise: stem = value // 10, leaf = the last digit. Values below
        10 get stem 0.

    data: sequence of numbers (integers are assumed for a meaningful plot).
    Prints "No data to plot." and returns when ``data`` is empty.
    Returns None; the plot is written to standard output.
    """
    if not data:
        print("No data to plot.")
        return

    sorted_data = sorted(data)

    # A divisor of 1 would make every value its own stem with a leaf of 0,
    # so the smallest split used is at the tens place.
    stem_divisor = 100 if sorted_data[-1] >= 1000 else 10
    # Two-digit leaves are zero-padded so that e.g. 5 prints as "05".
    leaf_width = 2 if stem_divisor == 100 else 1

    stems = {}
    for num in sorted_data:
        stem, leaf = divmod(num, stem_divisor)
        stems.setdefault(int(stem), []).append(f"{int(leaf):0{leaf_width}d}")

    print("\nStem-and-Leaf Plot (Flexible):")
    for stem in sorted(stems):
        print(f"{stem} | {' '.join(stems[stem])}")

# Plot the three-digit sample values with the flexible variant.
stem_and_leaf_plot_flexible(data_large)

def stem_and_leaf_plot(data):
    """Print a text stem-and-leaf display of ``data``.

    Each value is split at the tens place: the stem is ``value // 10`` and
    the leaf is ``value % 10`` (both truncated to ``int``). Stems are printed
    in ascending order, one per line, followed by their leaves in ascending
    order of the original values.

    data: iterable of numbers (integers are assumed for a meaningful plot).
    """
    leaves_by_stem = {}
    # Walking the values in sorted order keeps each stem's leaves sorted too.
    for value in sorted(data):
        tens, units = divmod(value, 10)
        leaves_by_stem.setdefault(int(tens), []).append(str(int(units)))

    print("Stem-and-Leaf Plot:")
    for tens in sorted(leaves_by_stem):
        print(f"{tens} | {' '.join(leaves_by_stem[tens])}")

# Sample data: two-digit integers, so each stem is a tens digit and each
# leaf is a units digit.

data = [12, 13, 21, 24, 25, 27, 30, 31, 32, 35, 41, 42, 45, 47, 50, 52, 53, 55, 60, 61, 62]

stem_and_leaf_plot(data)

# Example with a different range: three-digit values.

data_large = [101, 105, 112, 118, 120, 123, 127, 131, 135, 140]

def stem_and_leaf_plot_flexible(data):
    """
    Print a stem-and-leaf plot whose stem unit adapts to the data's magnitude.

    The largest value decides how each number is split:
      * largest value >= 1000: stem = value // 100, leaf = the last two
        digits (zero-padded), which keeps the number of stems manageable;
      * otherwise: stem = value // 10, leaf = the last digit. Values below
        10 get stem 0.

    data: sequence of numbers (integers are assumed for a meaningful plot).
    Prints "No data to plot." and returns when ``data`` is empty.
    Returns None; the plot is written to standard output.
    """
    if not data:
        print("No data to plot.")
        return

    sorted_data = sorted(data)

    # A divisor of 1 would make every value its own stem with a leaf of 0,
    # so the smallest split used is at the tens place.
    stem_divisor = 100 if sorted_data[-1] >= 1000 else 10
    # Two-digit leaves are zero-padded so that e.g. 5 prints as "05".
    leaf_width = 2 if stem_divisor == 100 else 1

    stems = {}
    for num in sorted_data:
        stem, leaf = divmod(num, stem_divisor)
        stems.setdefault(int(stem), []).append(f"{int(leaf):0{leaf_width}d}")

    print("\nStem-and-Leaf Plot (Flexible):")
    for stem in sorted(stems):
        print(f"{stem} | {' '.join(stems[stem])}")

# Plot the three-digit sample values with the flexible variant.
stem_and_leaf_plot_flexible(data_large)

৮. প্যারালাল কোঅর্ডিনেটস (Parallel Coordinates):

প্যারালাল কোঅর্ডিনেটস প্লট উচ্চ-মাত্রিক (high-dimensional) ডেটা ভিজ্যুয়ালাইজ করার জন্য একটি শক্তিশালী টুল। প্রতিটি ভেরিয়েবল একটি উল্লম্ব অক্ষ (vertical axis) দ্বারা প্রতিনিধিত্ব করা হয় এবং প্রতিটি ডেটা পয়েন্ট এই অক্ষগুলির মধ্য দিয়ে একটি লাইন হিসাবে দেখানো হয়। এটি বিভিন্ন ভেরিয়েবলের মধ্যে সম্পর্ক এবং ক্লাস্টারিং প্যাটার্ন (clustering patterns) সনাক্ত করতে সাহায্য করে।


import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
import numpy as np

# Sample data (e.g., Iris dataset) as bundled with scikit-learn.
from sklearn.datasets import load_iris

iris = load_iris()
# One table holding the feature columns plus a 'target' column with the
# species name for each row.
df_iris = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df_iris['target'] = pd.Series(iris['target']).map(
    {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
)

# Each row becomes one line across the feature axes, colored by species.
fig, ax = plt.subplots(figsize=(10, 6))
parallel_coordinates(df_iris, 'target', colormap='viridis', ax=ax)
ax.set(
    title='Parallel Coordinates Plot of Iris Dataset',
    xlabel='Features',
    ylabel='Value',
)
plt.show()

import pandas as pd

import matplotlib.pyplot as plt

from pandas.plotting import parallel_coordinates

import numpy as np

# Sample data (e.g., Iris dataset) as bundled with scikit-learn.
from sklearn.datasets import load_iris

iris = load_iris()
# One table holding the feature columns plus a 'target' column with the
# species name for each row.
df_iris = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df_iris['target'] = pd.Series(iris['target']).map(
    {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
)

# Each row becomes one line across the feature axes, colored by species.
fig, ax = plt.subplots(figsize=(10, 6))
parallel_coordinates(df_iris, 'target', colormap='viridis', ax=ax)
ax.set(
    title='Parallel Coordinates Plot of Iris Dataset',
    xlabel='Features',
    ylabel='Value',
)
plt.show()

৯. অডস রেশিও (Odds Ratio):

অডস রেশিও দুটি বাইনারি ইভেন্টের (binary events) মধ্যে সম্পর্ক পরিমাপ করে, বিশেষ করে কেস-কন্ট্রোল স্টাডিতে। এটি নির্দেশ করে যে একটি ইভেন্ট ঘটার সম্ভাবনা অন্য ইভেন্ট ঘটার সাপেক্ষে কতগুণ বেশি বা কম। এটি গ্রাফিক্যাল কৌশল নয়, বরং একটি পরিমাণগত পরিমাপ, তবে এটি প্রায়শই গ্রাফের মাধ্যমেও উপস্থাপন করা হয়। উদাহরণস্বরূপ, অসুস্থতার সাথে ধূমপানের সম্পর্ক:
$$\text{Odds Ratio} = \frac{A / B}{C / D} = \frac{A \times D}{B \times C}$$
যেখানে:

A = এক্সপোজড এবং অসুস্থ
B = এক্সপোজড এবং সুস্থ
C = আন-এক্সপোজড এবং অসুস্থ
D = আন-এক্সপোজড এবং সুস্থ


def calculate_odds_ratio(a, b, c, d):
    """
    Calculates the Odds Ratio (a / b) / (c / d) of a 2x2 table.

    a: Exposed and Diseased
    b: Exposed and Healthy
    c: Unexposed and Diseased
    d: Unexposed and Healthy

    Returns:
        The odds ratio as a float.
        float('inf') when the unexposed group has no diseased subjects
        (c == 0) while the exposed odds are non-zero.
        None, after printing a warning, when the ratio is undefined:
        b == 0 or d == 0 (an odds value cannot be computed), or both
        odds are zero (0 / 0).
    """
    # Only b and d are divisors of the two odds; c is a numerator and is
    # handled separately below.
    if b == 0 or d == 0:
        print("Warning: Division by zero possible. Check your counts.")
        return None

    odds_exposed = a / b
    odds_unexposed = c / d

    if odds_unexposed == 0:
        if odds_exposed == 0:
            # 0 / 0: neither group has any diseased subjects.
            print("Warning: Division by zero possible. Check your counts.")
            return None
        # Infinite odds ratio if unexposed group has no "disease".
        return float('inf')

    return odds_exposed / odds_unexposed

# Example: Smoking and Lung Cancer
# Hypothetical study counts laid out as a 2x2 table:
#                 lung cancer   no lung cancer
#   smokers          70 (A)         30 (B)
#   non-smokers      10 (C)         90 (D)
A_cancer_smoker, B_no_cancer_smoker = 70, 30
C_cancer_non_smoker, D_no_cancer_non_smoker = 10, 90

or_value = calculate_odds_ratio(
    A_cancer_smoker,
    B_no_cancer_smoker,
    C_cancer_non_smoker,
    D_no_cancer_non_smoker,
)

# None means the ratio could not be computed, so nothing is reported.
if or_value is not None:
    print(f"Odds Ratio for Lung Cancer (Smoker vs Non-Smoker): {or_value:.2f}")
    if or_value > 1:
        interpretation = "Interpretation: Being a smoker increases the odds of getting lung cancer."
    elif or_value < 1:
        interpretation = "Interpretation: Being a smoker decreases the odds of getting lung cancer."
    else:
        interpretation = "Interpretation: No association between smoking and lung cancer odds."
    print(interpretation)

def calculate_odds_ratio(a, b, c, d):
    """
    Calculates the Odds Ratio (a / b) / (c / d) of a 2x2 table.

    a: Exposed and Diseased
    b: Exposed and Healthy
    c: Unexposed and Diseased
    d: Unexposed and Healthy

    Returns:
        The odds ratio as a float.
        float('inf') when the unexposed group has no diseased subjects
        (c == 0) while the exposed odds are non-zero.
        None, after printing a warning, when the ratio is undefined:
        b == 0 or d == 0 (an odds value cannot be computed), or both
        odds are zero (0 / 0).
    """
    # Only b and d are divisors of the two odds; c is a numerator and is
    # handled separately below.
    if b == 0 or d == 0:
        print("Warning: Division by zero possible. Check your counts.")
        return None

    odds_exposed = a / b
    odds_unexposed = c / d

    if odds_unexposed == 0:
        if odds_exposed == 0:
            # 0 / 0: neither group has any diseased subjects.
            print("Warning: Division by zero possible. Check your counts.")
            return None
        # Infinite odds ratio if unexposed group has no "disease".
        return float('inf')

    return odds_exposed / odds_unexposed

# Example: Smoking and Lung Cancer
# Hypothetical study counts laid out as a 2x2 table:
#                 lung cancer   no lung cancer
#   smokers          70 (A)         30 (B)
#   non-smokers      10 (C)         90 (D)
A_cancer_smoker, B_no_cancer_smoker = 70, 30
C_cancer_non_smoker, D_no_cancer_non_smoker = 10, 90

or_value = calculate_odds_ratio(
    A_cancer_smoker,
    B_no_cancer_smoker,
    C_cancer_non_smoker,
    D_no_cancer_non_smoker,
)

# None means the ratio could not be computed, so nothing is reported.
if or_value is not None:
    print(f"Odds Ratio for Lung Cancer (Smoker vs Non-Smoker): {or_value:.2f}")
    if or_value > 1:
        interpretation = "Interpretation: Being a smoker increases the odds of getting lung cancer."
    elif or_value < 1:
        interpretation = "Interpretation: Being a smoker decreases the odds of getting lung cancer."
    else:
        interpretation = "Interpretation: No association between smoking and lung cancer odds."
    print(interpretation)

১০. টার্গেটেড প্রোজেকশন পারসুইট (Targeted Projection Pursuit):

টার্গেটেড প্রোজেকশন পারসুইট (TPP) হলো একটি ডাইমেনশনালিটি রিডাকশন কৌশল যা ডেটার “আকর্ষণীয়” (interesting) প্রোজেকশনগুলি খুঁজে বের করার চেষ্টা করে, যা সাধারণত ডেটার মধ্যে অ-গাউসিয়ান (non-Gaussian) বা অ-রৈখিক (non-linear) প্যাটার্নগুলিকে প্রকাশ করে। PCA যেখানে সর্বোচ্চ ভ্যারিয়েন্স সহ প্রোজেকশন খুঁজে বের করে, সেখানে TPP ডেটার নির্দিষ্ট কাঠামোগত বৈশিষ্ট্যগুলি (যেমন, ক্লাস্টার, আউটলায়ার) হাইলাইট করার জন্য ডিজাইন করা “প্রোজেকশন ইনডেক্স” (projection index) ব্যবহার করে। এটি পাইথনে সরাসরি scikit-learn এ অন্তর্ভুক্ত না থাকলেও, কিছু গবেষণামূলক লাইব্রেরি বা কাস্টম ইমপ্লিমেন্টেশন এর জন্য পাওয়া যায়। এটি একটি জটিল অ্যালগরিদম।

১১. হিট ম্যাপ (Heat Map):

হিট ম্যাপ ডেটা ম্যাট্রিক্সের (data matrix) মানগুলো কালার কোডিংয়ের (color coding) মাধ্যমে ভিজ্যুয়ালাইজ করে। এটি প্রায়শই পারস্পরিক সম্পর্ক (correlation) ম্যাট্রিক্স দেখাতে ব্যবহৃত হয়।


import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Sample correlation matrix: pairwise correlations between variables A-D
# (symmetric, with 1.0 on the diagonal).
data = {
    'A': [1.0, 0.8, 0.2, 0.5],
    'B': [0.8, 1.0, 0.6, 0.3],
    'C': [0.2, 0.6, 1.0, 0.9],
    'D': [0.5, 0.3, 0.9, 1.0]
}
# Label the rows with the same variable names as the columns.
df = pd.DataFrame(data, index=list(data))
# The values are already correlations, so they are plotted as-is. Calling
# df.corr() here would compute correlations *of* the correlation columns
# and the heat map would no longer show the matrix defined above.
correlation_matrix = df

plt.figure(figsize=(8, 7))
# Fix the color scale to the full correlation range so that 0 sits at the
# neutral midpoint of the diverging 'coolwarm' palette.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Heat Map (Correlation Matrix) Example')
plt.show()

import seaborn as sns

import matplotlib.pyplot as plt

import pandas as pd

# Sample correlation matrix: pairwise correlations between variables A-D
# (symmetric, with 1.0 on the diagonal).
data = {
    'A': [1.0, 0.8, 0.2, 0.5],
    'B': [0.8, 1.0, 0.6, 0.3],
    'C': [0.2, 0.6, 1.0, 0.9],
    'D': [0.5, 0.3, 0.9, 1.0]
}
# Label the rows with the same variable names as the columns.
df = pd.DataFrame(data, index=list(data))
# The values are already correlations, so they are plotted as-is. Calling
# df.corr() here would compute correlations *of* the correlation columns
# and the heat map would no longer show the matrix defined above.
correlation_matrix = df

plt.figure(figsize=(8, 7))
# Fix the color scale to the full correlation range so that 0 sits at the
# neutral midpoint of the diverging 'coolwarm' palette.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Heat Map (Correlation Matrix) Example')
plt.show()

১২. বার চার্ট (Bar Chart):

বার চার্ট বিভিন্ন ক্যাটাগরির ডেটা তুলনা করার জন্য ব্যবহৃত হয়। এটি ক্যাটাগরিগুলোর মান (magnitude) বার বা আয়তক্ষেত্রের দৈর্ঘ্যের মাধ্যমে দেখায়।


import matplotlib.pyplot as plt

# Sample data: category labels and the value plotted for each one.
categories = ['A', 'B', 'C', 'D']
values = [10, 25, 15, 30]

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(categories, values, color='skyblue')
ax.set(title='Bar Chart Example', xlabel='Categories', ylabel='Values')
plt.show()

import matplotlib.pyplot as plt

# Sample data: category labels and the value plotted for each one.
categories = ['A', 'B', 'C', 'D']
values = [10, 25, 15, 30]

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(categories, values, color='skyblue')
ax.set(title='Bar Chart Example', xlabel='Categories', ylabel='Values')
plt.show()

১৪. হরাইজন গ্রাফ (Horizon Graph):

হরাইজন গ্রাফ একটি টাইম সিরিজ ডেটা ভিজ্যুয়ালাইজেশনের জন্য ব্যবহৃত হয় যেখানে একাধিক টাইম সিরিজকে সংক্ষিপ্ত এবং ঘনভাবে প্লট করা যায়। এটি লাইন গ্রাফের একটি উন্নত রূপ, যা একই সাথে অনেক সিরিজকে কার্যকরভাবে তুলনা করতে সাহায্য করে, বিশেষ করে যখন স্থান সীমিত থাকে। এটি ডেটা রেঞ্জকে বিভিন্ন শেডেড ব্যান্ডে বিভক্ত করে এবং নেতিবাচক মানগুলিকে উল্টে (inverted) দেখায়। পাইথনে pyhorizon বা কাস্টম matplotlib ফাংশন ব্যবহার করে এটি তৈরি করা যেতে পারে।


# Horizon Graph (conceptual example, requires specific library or complex matplotlib)
# This would typically involve:
# 1. Defining color bands for different value ranges.
# 2. Plotting the line and filling segments based on value.
# 3. Inverting negative values and overlaying.

# Example structure using a conceptual approach:
import matplotlib.pyplot as plt
import numpy as np

# Sample time series data: a sine wave (amplitude 5) plus Gaussian noise,
# lifted by 2, over 100 time points.
time_points = np.arange(100)
series_data = np.sin(time_points / 10) * 5 + np.random.normal(0, 0.5, 100) + 2

fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(time_points, series_data, color='blue', alpha=0.7)
# Shade the area between the curve and the zero baseline: light blue where
# the series is positive, salmon where it is negative.
for region, shade in ((series_data > 0, 'lightblue'), (series_data < 0, 'salmon')):
    ax.fill_between(time_points, 0, series_data, where=region, color=shade, alpha=0.4)
ax.axhline(0, color='gray', linestyle='--', linewidth=0.8)  # Baseline

# In a true horizon graph, this would be layered with different color
# intensities and negative values folded over. This is a simplified
# representation.
ax.set(title='Simplified Horizon Graph Concept', xlabel='Time', ylabel='Value')
plt.show()

# Horizon Graph (conceptual example, requires specific library or complex matplotlib)

# This would typically involve:

# 1. Defining color bands for different value ranges.

# 2. Plotting the line and filling segments based on value.

# 3. Inverting negative values and overlaying.

# Example structure using a conceptual approach:

import matplotlib.pyplot as plt

import numpy as np

# Sample time series data: a sine wave (amplitude 5) plus Gaussian noise,
# lifted by 2, over 100 time points.
time_points = np.arange(100)
series_data = np.sin(time_points / 10) * 5 + np.random.normal(0, 0.5, 100) + 2

fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(time_points, series_data, color='blue', alpha=0.7)
# Shade the area between the curve and the zero baseline: light blue where
# the series is positive, salmon where it is negative.
for region, shade in ((series_data > 0, 'lightblue'), (series_data < 0, 'salmon')):
    ax.fill_between(time_points, 0, series_data, where=region, color=shade, alpha=0.4)
ax.axhline(0, color='gray', linestyle='--', linewidth=0.8)  # Baseline

# In a true horizon graph, this would be layered with different color
# intensities and negative values folded over. This is a simplified
# representation.
ax.set(title='Simplified Horizon Graph Concept', xlabel='Time', ylabel='Value')
plt.show()

১৫. গ্লিফ-ভিত্তিক ভিজ্যুয়ালাইজেশন পদ্ধতি (Glyph-based Visualization Methods) যেমন PhenoPlot এবং Chernoff Faces:

গ্লিফ-ভিত্তিক ভিজ্যুয়ালাইজেশন ডেটা পয়েন্টগুলিকে ছোট প্রতীক (glyphs) বা “মুখ” (Chernoff Faces) দ্বারা প্রতিনিধিত্ব করে, যেখানে প্রতিটি ডেটা ভেরিয়েবল গ্লিফের একটি নির্দিষ্ট বৈশিষ্ট্য (যেমন, আকার, রঙ, মুখভঙ্গি) দ্বারা ম্যাপ করা হয়। এটি উচ্চ-মাত্রিক ডেটার মধ্যে প্যাটার্ন এবং সম্পর্কগুলি উপলব্ধি করতে সাহায্য করে।

PhenoPlot: সাধারণত ফেনোটাইপিক ডেটা (যেমন, জীববিজ্ঞান বা স্বাস্থ্য ক্ষেত্রে) ভিজ্যুয়ালাইজ করতে ব্যবহৃত হয়, যেখানে প্রতিটি ডেটা পয়েন্ট একটি চক্রাকার প্লটে গ্লিফ দ্বারা প্রতিনিধিত্ব করা হয়।
Chernoff Faces: ডেটার প্রতিটি ভেরিয়েবলকে মুখের বিভিন্ন বৈশিষ্ট্য (যেমন, চোখের আকার, নাকের দৈর্ঘ্য, মুখের বক্রতা) দ্বারা ম্যাপ করা হয়। এটি ডেটার ক্লাস্টার বা আউটলায়ারগুলি মানুষের মুখ চেনার ক্ষমতা ব্যবহার করে দ্রুত সনাক্ত করতে সাহায্য করে। matplotlib ব্যবহার করে Chernoff Faces কাস্টমাইজ করে তৈরি করা যেতে পারে, তবে এটি তুলনামূলকভাবে জটিল।


# Chernoff Faces (Conceptual - requires complex drawing logic)
# A full implementation of Chernoff Faces is quite involved,
# as it requires mapping each data feature to a specific facial attribute.
# No direct library exists in popular Python visualization tools for this.
# You would essentially draw circles, arcs, lines for each face,
# with parameters for each feature like:
# - Eye size proportional to feature X
# - Mouth curvature proportional to feature Y
# - Nose length proportional to feature Z
# etc.

# Chernoff Faces (Conceptual - requires complex drawing logic)

# A full implementation of Chernoff Faces is quite involved,

# as it requires mapping each data feature to a specific facial attribute.

# No direct library exists in popular Python visualization tools for this.

# You would essentially draw circles, arcs, lines for each face,

# with parameters for each feature like:

# - Eye size proportional to feature X

# - Mouth curvature proportional to feature Y

# - Nose length proportional to feature Z

# etc.

১৬. প্রোজেকশন পদ্ধতি (Projection Methods) যেমন grand tour, guided tour এবং manual tour:

এগুলো ডেটার উচ্চ-মাত্রিক স্থানের (high-dimensional space) ইন্টারেক্টিভ অন্বেষণের কৌশল। ডেটাকে একটি নিম্ন-মাত্রিক (সাধারণত 2D) উপস্থাপনায় প্রজেক্ট করা হয়, এবং এই প্রজেকশনগুলো অ্যানিমেশনের মাধ্যমে পরিবর্তন করা হয়, যা ডেটার বিভিন্ন দৃষ্টিভঙ্গি প্রকাশ করে।

Grand Tour: এটি উচ্চ-মাত্রিক ডেটার সমস্ত সম্ভাব্য 2D প্রজেকশনগুলি অন্বেষণ করে। এটি এলোমেলোভাবে প্রজেকশনের দিক পরিবর্তন করে ডেটা পয়েন্টগুলির বিভিন্ন সম্ভাব্য ক্লাস্টার এবং প্যাটার্ন প্রকাশ করে।
Guided Tour: গ্র্যান্ড ট্যুরের মতো, তবে এটি নির্দিষ্ট “আকর্ষণীয়” (interesting) প্রোজেকশনগুলিতে ফোকাস করে, যেমন ডেটা ঘনত্বের পরিবর্তন বা আউটলায়ারগুলির উপস্থিতি।
Manual Tour: ব্যবহারকারী নিজেই প্রজেকশনের দিকগুলি ম্যানুয়ালি নিয়ন্ত্রণ করে ডেটা অন্বেষণ করে।

এই কৌশলগুলি সাধারণত ইন্টারেক্টিভ ভিজ্যুয়ালাইজেশন লাইব্রেরি যেমন ggobi (যা R/C++ ভিত্তিক) বা কিছু পাইথন লাইব্রেরিতে (যেমন plotly বা ipyvolume এর কিছু এক্সটেনশন) পাওয়া যেতে পারে, তবে এর জন্য বিশেষভাবে ডিজাইন করা প্যাকেজ সাধারণত R-এ বেশি প্রচলিত।

ডাইমেনশনালিটি রিডাকশন (Dimensionality Reduction):

ডাইমেনশনালিটি রিডাকশন হলো ডেটাসেটের ভেরিয়েবলের সংখ্যা কমানোর প্রক্রিয়া, যেখানে মূল ডেটার বেশিরভাগ গুরুত্বপূর্ণ তথ্য ধরে রাখা হয়।

১. মাল্টিডাইমেনশনাল স্কেলিং (Multidimensional Scaling – MDS):

MDS ডেটা পয়েন্টগুলির মধ্যে বিদ্যমান দূরত্বের উপর ভিত্তি করে একটি নিম্ন-মাত্রিক উপস্থাপনা তৈরি করে। এটি ডেটা পয়েন্টগুলির আপেক্ষিক দূরত্ব বা অ-সাদৃশ্য (dissimilarities) বজায় রেখে ডেটাকে কম মাত্রায় প্রজেক্ট করে। এটি ডেটার অন্তর্নিহিত জ্যামিতিক কাঠামো (geometric structure) প্রকাশ করতে সহায়ক।


from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import numpy as np

# Sample data: synthetic pairwise distances between four points A-D,
# written as a full symmetric matrix with zeros on the diagonal
# (A-B=2, A-C=3, A-D=4, B-C=1, B-D=2, C-D=1).
dissimilarities = np.array([
    [0, 2, 3, 4],
    [2, 0, 1, 2],
    [3, 1, 0, 1],
    [4, 2, 1, 0],
])

# dissimilarity='precomputed' tells MDS that the input already is a
# distance matrix; n_components=2 asks for a 2-D embedding.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)

# Fit the model; the result holds one row of 2-D coordinates per point.
embedding = mds.fit_transform(dissimilarities)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(embedding[:, 0], embedding[:, 1], s=100)
# Label every embedded point with its 1-based position in the matrix.
for point_no, (px, py) in enumerate(embedding, start=1):
    ax.annotate(f'Point {point_no}', (px, py), textcoords="offset points", xytext=(5, 5), ha='center')
ax.set(
    title='Multidimensional Scaling (MDS) Example',
    xlabel='MDS Dimension 1',
    ylabel='MDS Dimension 2',
)
ax.grid(True)
plt.show()

from sklearn.manifold import MDS

import matplotlib.pyplot as plt

import numpy as np

# Sample data: synthetic pairwise distances between four points A-D,
# written as a full symmetric matrix with zeros on the diagonal
# (A-B=2, A-C=3, A-D=4, B-C=1, B-D=2, C-D=1).
dissimilarities = np.array([
    [0, 2, 3, 4],
    [2, 0, 1, 2],
    [3, 1, 0, 1],
    [4, 2, 1, 0],
])

# dissimilarity='precomputed' tells MDS that the input already is a
# distance matrix; n_components=2 asks for a 2-D embedding.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)

# Fit the model; the result holds one row of 2-D coordinates per point.
embedding = mds.fit_transform(dissimilarities)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(embedding[:, 0], embedding[:, 1], s=100)
# Label every embedded point with its 1-based position in the matrix.
for point_no, (px, py) in enumerate(embedding, start=1):
    ax.annotate(f'Point {point_no}', (px, py), textcoords="offset points", xytext=(5, 5), ha='center')
ax.set(
    title='Multidimensional Scaling (MDS) Example',
    xlabel='MDS Dimension 1',
    ylabel='MDS Dimension 2',
)
ax.grid(True)
plt.show()

২. প্রিন্সিপাল কম্পোনেন্ট অ্যানালাইসিস (Principal Component Analysis – PCA):

PCA একটি রৈখিক ডাইমেনশনালিটি রিডাকশন কৌশল যা ডেটাকে অর্থোগোনাল (orthogonal) কম্পোনেন্টগুলিতে রূপান্তর করে, যা প্রিন্সিপাল কম্পোনেন্টস নামে পরিচিত। এটি ডেটার ভ্যারিয়েন্সের বেশিরভাগ অংশকে অল্প সংখ্যক কম্পোনেন্টে ধরে রাখে।


from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns  # required by sns.scatterplot below
import pandas as pd
import numpy as np

# Sample data (e.g., Iris dataset)
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

# Standardize the data before PCA: PCA is driven by variance, so features
# measured on larger scales would otherwise dominate the components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2) # Reduce to 2 dimensions
principal_components = pca.fit_transform(X_scaled)

# Create a DataFrame for visualization: one row per sample with its two
# component scores and its class label.
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Target'] = y

plt.figure(figsize=(10, 8))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue='Target', data=pca_df, palette='viridis')
plt.title('PCA of Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# Share of the total variance captured by each component, and by both together.
print(f"Explained variance ratio by each principal component: {pca.explained_variance_ratio_}")
print(f"Total explained variance by 2 components: {pca.explained_variance_ratio_.sum()}")

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns  # required by sns.scatterplot below
import pandas as pd
import numpy as np

# Sample data (e.g., Iris dataset)
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

# Standardize the data before PCA: PCA is driven by variance, so features
# measured on larger scales would otherwise dominate the components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2) # Reduce to 2 dimensions
principal_components = pca.fit_transform(X_scaled)

# Create a DataFrame for visualization: one row per sample with its two
# component scores and its class label.
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Target'] = y

plt.figure(figsize=(10, 8))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue='Target', data=pca_df, palette='viridis')
plt.title('PCA of Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# Share of the total variance captured by each component, and by both together.
print(f"Explained variance ratio by each principal component: {pca.explained_variance_ratio_}")
print(f"Total explained variance by 2 components: {pca.explained_variance_ratio_.sum()}")

৩. মাল্টিলিনিয়ার পিসিএ (Multilinear PCA – MPCA):

মাল্টিলিনিয়ার পিসিএ (MPCA) হলো প্রিন্সিপাল কম্পোনেন্ট অ্যানালাইসিসের একটি এক্সটেনশন যা মাল্টিলিনিয়ার ডেটা (যেমন, টেনসর ডেটা, যেখানে ডেটার একাধিক মোড বা মাত্রা থাকে, যেমন ইমেজ টাইম সিরিজ) নিয়ে কাজ করার জন্য ডিজাইন করা হয়েছে। এটি প্রতিটি মোড বরাবর ভ্যারিয়েন্সকে আলাদাভাবে অপ্টিমাইজ করে, যা উচ্চ-মাত্রিক টেনসর ডেটা হ্রাস করার জন্য কার্যকর। scikit-learn এ এটি সরাসরি নেই, তবে tensorly এর মতো লাইব্রেরিতে টেনসর ডিকম্পোজিশন পদ্ধতি আছে যা MPCA-এর ধারণার সাথে সম্পর্কিত।

৪. নন-লিনিয়ার ডাইমেনশনালিটি রিডাকশন (Nonlinear Dimensionality Reduction – NLDR):

NLDR কৌশলগুলি ডেটার মধ্যে অ-রৈখিক সম্পর্ক (non-linear relationships) ক্যাপচার করার চেষ্টা করে এবং একটি নিম্ন-মাত্রিক উপস্থাপনা খুঁজে বের করে যেখানে এই সম্পর্কগুলি ভালভাবে বজায় থাকে। এর মধ্যে অনেক অ্যালগরিদম রয়েছে, যেমন:

Isomap: ডেটা পয়েন্টগুলির মধ্যে জিওডেসিক দূরত্ব (geodesic distances) ব্যবহার করে একটি নিম্ন-মাত্রিক এম্বেডিং তৈরি করে, যা ডেটার অন্তর্নিহিত “ম্যানিফোল্ড” (manifold) কাঠামো সংরক্ষণ করে।
Locally Linear Embedding (LLE): প্রতিটি ডেটা পয়েন্টকে তার প্রতিবেশীদের একটি রৈখিক সংমিশ্রণ (linear combination) হিসাবে বর্ণনা করে এবং তারপর এই প্রতিবেশীর সম্পর্ক বজায় রেখে একটি নিম্ন-মাত্রিক এম্বেডিং খুঁজে বের করে।
t-distributed Stochastic Neighbor Embedding (t-SNE): বিশেষত উচ্চ-মাত্রিক ডেটাকে 2D বা 3D তে ভিজ্যুয়ালাইজ করার জন্য কার্যকর। এটি ডেটা পয়েন্টগুলির মধ্যে উচ্চ-মাত্রিক স্থান এবং নিম্ন-মাত্রিক স্থানের মধ্যে সাদৃশ্য (similarities) বজায় রাখার চেষ্টা করে, যা ক্লাস্টারগুলিকে পরিষ্কারভাবে দেখাতে সাহায্য করে।


# t-SNE demo: embed the Iris features in two dimensions and plot the result
# coloured by species name.
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns  # provides sns.scatterplot used for the plot below
import pandas as pd
import numpy as np

# Sample data (e.g., Iris dataset)
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30) # n_components is typically 2 or 3
tsne_results = tsne.fit_transform(X)

# Create a DataFrame for visualization
tsne_df = pd.DataFrame(data=tsne_results, columns=['t-SNE Dimension 1', 't-SNE Dimension 2'])
tsne_df['Target'] = y
# Replace the integer class codes with readable species names for the legend
tsne_df['Target'] = tsne_df['Target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})


plt.figure(figsize=(10, 8))
sns.scatterplot(x='t-SNE Dimension 1', y='t-SNE Dimension 2', hue='Target', data=tsne_df, palette='viridis', s=70)
plt.title('t-SNE of Iris Dataset')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()

# t-SNE demo: embed the Iris features in two dimensions and plot the result
# coloured by species name.
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns  # provides sns.scatterplot used for the plot below
import pandas as pd
import numpy as np

# Sample data (e.g., Iris dataset)
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

# Apply t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30) # n_components is typically 2 or 3
tsne_results = tsne.fit_transform(X)

# Create a DataFrame for visualization
tsne_df = pd.DataFrame(data=tsne_results, columns=['t-SNE Dimension 1', 't-SNE Dimension 2'])
tsne_df['Target'] = y
# Replace the integer class codes with readable species names for the legend
tsne_df['Target'] = tsne_df['Target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

plt.figure(figsize=(10, 8))
sns.scatterplot(x='t-SNE Dimension 1', y='t-SNE Dimension 2', hue='Target', data=tsne_df, palette='viridis', s=70)
plt.title('t-SNE of Iris Dataset')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()

৫. আইকনোগ্রাফি অফ কোরিলেশনস (Iconography of Correlations):

এটি ডাইমেনশনালিটি রিডাকশন বা ভিজ্যুয়ালাইজেশনের একটি পদ্ধতি যেখানে ডেটা ভেরিয়েবলগুলির মধ্যে পারস্পরিক সম্পর্ক (correlations) বা অন্যান্য সম্পর্কগুলি আইকনিক চিত্র বা গ্লিফ ব্যবহার করে উপস্থাপন করা হয়। এটি একটি দৃশ্যত সমৃদ্ধ উপায়ে সম্পর্কগুলি বুঝতে সাহায্য করে। Chernoff Faces-এর মতো, এটিও গ্লিফ-ভিত্তিক ভিজ্যুয়ালাইজেশনের একটি রূপ যা মানুষের চাক্ষুষ স্বীকৃতির ক্ষমতা ব্যবহার করে ডেটা বিশ্লেষণ করে। এর জন্য কাস্টম ভিজ্যুয়ালাইজেশন তৈরির প্রয়োজন হতে পারে, কারণ এর জন্য সরাসরি পাইথন লাইব্রেরি সহজলভ্য নয়।

পরিমাণগত কৌশল (Quantitative Techniques):

পরিমাণগত কৌশলগুলো ডেটা থেকে সংখ্যাসূচক সারাংশ এবং পরিসংখ্যানগত পরিমাপ পেতে ব্যবহৃত হয়।

১. মিডিয়ান পলিশ (Median Polish):

মিডিয়ান পলিশ হল একটি এডিএ কৌশল যা দ্বি-পথ বা বহু-পথ সারণীর (two-way or multi-way table) ডেটা থেকে সারি এবং কলামের প্রভাব (row and column effects) অনুমান করতে এবং ডেটার অবশিষ্ট অংশ (residuals) পরিমাপ করতে ব্যবহৃত হয়। এটি গড় (mean) এর পরিবর্তে মধ্যমা (median) ব্যবহার করে, যা আউটলায়ারগুলির প্রতি কম সংবেদনশীল। এটি প্রায়শই R-এ ব্যবহৃত হয় এবং পাইথনে সরাসরি একটি জনপ্রিয় লাইব্রেরিতে নাও থাকতে পারে, তবে এটি ম্যাট্রিক্স অপারেশন ব্যবহার করে ম্যানুয়ালি প্রয়োগ করা যেতে পারে।
একটি সরলীকৃত ধারণার জন্য: ধরা যাক, আমাদের কাছে একটি ডেটা ম্যাট্রিক্স আছে। মিডিয়ান পলিশ ডেটা থেকে সারি মধ্যমা এবং কলাম মধ্যমা বিয়োগ করে এবং একটি সাধারণ মধ্যমা যোগ করে ডেটাকে আরও বোধগম্য করে তোলে।


import numpy as np

def median_polish_simple(data_matrix, max_iter=10):
    """Decompose a two-way table with median polish.

    The table is modelled additively as
    ``data[i, j] = overall + row_effects[i] + col_effects[j] + residuals[i, j]``.
    Row medians and column medians are swept out of the residuals in turn.
    After each sweep the median of the other effect vector is moved into the
    overall effect, so the overall effect carries the common level of the
    table and both effect vectors end up centred on a median of zero.

    Args:
        data_matrix: Two-dimensional array-like of numbers.
        max_iter: Maximum number of combined row/column sweeps.

    Returns:
        Tuple ``(overall_effect, row_effects, col_effects, residuals)``.

    Raises:
        ValueError: If ``data_matrix`` is not two-dimensional.
    """
    # np.array copies its input, so the caller's data is never modified.
    residuals = np.array(data_matrix, dtype=float)
    if residuals.ndim != 2:
        raise ValueError("data_matrix must be two-dimensional")

    n_rows, n_cols = residuals.shape
    row_effects = np.zeros(n_rows)
    col_effects = np.zeros(n_cols)
    overall_effect = 0.0

    for _ in range(max_iter):
        # Row sweep: move each row's median from the residuals to its effect.
        row_delta = np.median(residuals, axis=1)
        residuals -= row_delta[:, np.newaxis]
        row_effects += row_delta

        # Re-centre the column effects; their median belongs to the overall level.
        shift = np.median(col_effects)
        col_effects -= shift
        overall_effect += shift

        # Column sweep: move each column's median to its effect.
        col_delta = np.median(residuals, axis=0)
        residuals -= col_delta[np.newaxis, :]
        col_effects += col_delta

        # Re-centre the row effects in the same way.
        shift = np.median(row_effects)
        row_effects -= shift
        overall_effect += shift

        # Nothing was swept out in this pass, so further passes change nothing.
        if not (row_delta.any() or col_delta.any()):
            break

    return overall_effect, row_effects, col_effects, residuals

# Sample 3x3 table used to demonstrate the decomposition
table_rows = (
    [10, 12, 11],
    [15, 18, 16],
    [20, 21, 19],
)
data_matrix = np.array(table_rows)

# Run the polish, then unpack the four parts of the result.
polish_result = median_polish_simple(data_matrix)
overall, rows, cols, res = polish_result

# Show the input followed by each component of the decomposition.
print("Original Matrix:\n", data_matrix)
print("\nOverall Effect:", overall)
print("Row Effects:", rows)
print("Column Effects:", cols)
print("Residuals:\n", res)

import numpy as np

def median_polish_simple(data_matrix, max_iter=10):
    """Decompose a two-way table with median polish.

    The table is modelled additively as
    ``data[i, j] = overall + row_effects[i] + col_effects[j] + residuals[i, j]``.
    Row medians and column medians are swept out of the residuals in turn.
    After each sweep the median of the other effect vector is moved into the
    overall effect, so the overall effect carries the common level of the
    table and both effect vectors end up centred on a median of zero.

    Args:
        data_matrix: Two-dimensional array-like of numbers.
        max_iter: Maximum number of combined row/column sweeps.

    Returns:
        Tuple ``(overall_effect, row_effects, col_effects, residuals)``.

    Raises:
        ValueError: If ``data_matrix`` is not two-dimensional.
    """
    # np.array copies its input, so the caller's data is never modified.
    residuals = np.array(data_matrix, dtype=float)
    if residuals.ndim != 2:
        raise ValueError("data_matrix must be two-dimensional")

    n_rows, n_cols = residuals.shape
    row_effects = np.zeros(n_rows)
    col_effects = np.zeros(n_cols)
    overall_effect = 0.0

    for _ in range(max_iter):
        # Row sweep: move each row's median from the residuals to its effect.
        row_delta = np.median(residuals, axis=1)
        residuals -= row_delta[:, np.newaxis]
        row_effects += row_delta

        # Re-centre the column effects; their median belongs to the overall level.
        shift = np.median(col_effects)
        col_effects -= shift
        overall_effect += shift

        # Column sweep: move each column's median to its effect.
        col_delta = np.median(residuals, axis=0)
        residuals -= col_delta[np.newaxis, :]
        col_effects += col_delta

        # Re-centre the row effects in the same way.
        shift = np.median(row_effects)
        row_effects -= shift
        overall_effect += shift

        # Nothing was swept out in this pass, so further passes change nothing.
        if not (row_delta.any() or col_delta.any()):
            break

    return overall_effect, row_effects, col_effects, residuals

# Sample 3x3 table used to demonstrate the decomposition
table_rows = (
    [10, 12, 11],
    [15, 18, 16],
    [20, 21, 19],
)
data_matrix = np.array(table_rows)

# Run the polish, then unpack the four parts of the result.
polish_result = median_polish_simple(data_matrix)
overall, rows, cols, res = polish_result

# Show the input followed by each component of the decomposition.
print("Original Matrix:\n", data_matrix)
print("\nOverall Effect:", overall)
print("Row Effects:", rows)
print("Column Effects:", cols)
print("Residuals:\n", res)

২. ট্রাইমিন (Trimean):

ট্রাইমিন হলো ডেটার মধ্যমা (median) এবং দুটি কোয়ার্টাইলের (first and third quartiles) একটি ওজনযুক্ত গড়। এটি গড়ের চেয়ে আউটলায়ারগুলির প্রতি কম সংবেদনশীল এবং মধ্যমার চেয়ে ডেটার বিতরণের একটি ভাল ধারণা দেয়।
ট্রাইমিন = $(Q_1 + 2 Q_2 + Q_3) / 4$ যেখানে $Q_1$ প্রথম কোয়ার্টাইল, $Q_2$ মধ্যমা (দ্বিতীয় কোয়ার্টাইল), এবং $Q_3$ তৃতীয় কোয়ার্টাইল।


import numpy as np

def calculate_trimean(data):
    """Return the trimean of *data*: ``(Q1 + 2 * median + Q3) / 4``.

    Q1 and Q3 are the 25th and 75th percentiles as computed by
    ``np.percentile``; the middle term is ``np.median``.
    """
    lower_quartile, upper_quartile = np.percentile(data, [25, 75])
    middle = np.median(data)
    # The median is weighted twice as heavily as either quartile.
    return (lower_quartile + 2 * middle + upper_quartile) / 4

# Sample data: the integers 1..10 plus one extreme value acting as an outlier
data = [*range(1, 11), 100]

# Print the three location estimates one after another for comparison.
median_value = np.median(data)
print(f"Median: {median_value}")
mean_value = np.mean(data)
print(f"Mean: {mean_value}")
trimean_value = calculate_trimean(data)
print(f"Trimean: {trimean_value}")

import numpy as np

def calculate_trimean(data):
    """Return the trimean of *data*: ``(Q1 + 2 * median + Q3) / 4``.

    Q1 and Q3 are the 25th and 75th percentiles as computed by
    ``np.percentile``; the middle term is ``np.median``.
    """
    q1 = np.percentile(data, 25)
    median = np.median(data)
    q3 = np.percentile(data, 75)
    # The median is weighted twice as heavily as either quartile.
    trimean = (q1 + 2 * median + q3) / 4
    return trimean

# Sample data: the integers 1..10 plus one extreme value acting as an outlier
data = [*range(1, 11), 100]

# Print the three location estimates one after another for comparison.
median_value = np.median(data)
print(f"Median: {median_value}")
mean_value = np.mean(data)
print(f"Mean: {mean_value}")
trimean_value = calculate_trimean(data)
print(f"Trimean: {trimean_value}")

৩. অর্ডিনেশন (Ordination):

অর্ডিনেশন হলো একটি পরিসংখ্যানগত কৌশল যা জটিল ডেটাসেটের অন্তর্নিহিত কাঠামো প্রকাশ করতে ব্যবহৃত হয়, বিশেষ করে পরিবেশগত বিজ্ঞান বা ইকোলজিতে যেখানে একাধিক প্রজাতি এবং পরিবেশগত ভেরিয়েবল থাকে। এর লক্ষ্য হলো ডেটা পয়েন্ট (যেমন, প্লট, সাইট) বা ভেরিয়েবল (যেমন, প্রজাতি) গুলিকে একটি নিম্ন-মাত্রিক স্থানে সাজানো যাতে তাদের মধ্যে বিদ্যমান সম্পর্ক বা সাদৃশ্য দৃশ্যমান হয়। PCA এবং MDS-এর মতো ডাইমেনশনালিটি রিডাকশন কৌশলগুলিও অর্ডিনেশনের অন্তর্ভুক্ত। অন্যান্য অর্ডিনেশন পদ্ধতিগুলির মধ্যে রয়েছে:

Correspondence Analysis (CA): ক্যাটাগরিক্যাল ডেটা বা ফ্রিকোয়েন্সি টেবিলের জন্য ব্যবহৃত হয়।
Canonical Correspondence Analysis (CCA): পরিবেশগত ভেরিয়েবলগুলির সাথে প্রজাতির ডেটার সম্পর্ক পরীক্ষা করে।

পাইথনে scipy.stats এবং scikit-learn এর কিছু অংশ অর্ডিনেশন-সম্পর্কিত কাজে ব্যবহার করা যেতে পারে, এবং skbio (scikit-bio) এর মতো বায়োইনফরমেটিক্স লাইব্রেরিতে নির্দিষ্ট অর্ডিনেশন মেথড উপলব্ধ।


from scipy.stats import chi2_contingency
import numpy as np

# Illustration only: this is not a full ordination. It runs a chi-squared test of
# association on a species-by-site contingency table, the kind of table that
# Correspondence Analysis (CA) could analyse.

# Counts of three tree species (rows) in forest plots 1, 2 and 3 (columns)
species_counts = {
    "A": [10, 5, 2],
    "B": [3, 8, 15],
    "C": [7, 2, 8],
}
data_counts = np.array(list(species_counts.values()))

# A real CA or CCA ordination plot needs a specialised library or a fuller
# implementation; the chi-squared test is shown here as a precursor to CA.
test_result = chi2_contingency(data_counts)
chi2, p_value, dof, expected = test_result

print("Contingency Table:", data_counts, sep="\n")
print(f"\nChi-squared statistic: {chi2:.2f}")
print(f"P-value: {p_value:.3f}")
print("Interpretation: If p-value < 0.05, there's a significant association between species distribution and plots.")

from scipy.stats import chi2_contingency
import numpy as np

# Illustration only: this is not a full ordination. It runs a chi-squared test of
# association on a species-by-site contingency table, the kind of table that
# Correspondence Analysis (CA) could analyse.

# Counts of three tree species (rows) in forest plots 1, 2 and 3 (columns)
species_counts = {
    "A": [10, 5, 2],
    "B": [3, 8, 15],
    "C": [7, 2, 8],
}
data_counts = np.array(list(species_counts.values()))

# A real CA or CCA ordination plot needs a specialised library or a fuller
# implementation; the chi-squared test is shown here as a precursor to CA.
test_result = chi2_contingency(data_counts)
chi2, p_value, dof, expected = test_result

print("Contingency Table:", data_counts, sep="\n")
print(f"\nChi-squared statistic: {chi2:.2f}")
print(f"P-value: {p_value:.3f}")
print("Interpretation: If p-value < 0.05, there's a significant association between species distribution and plots.")

আমি আশা করি, এই বিস্তারিত আলোচনা ও উদাহরণগুলো আপনাকে EDA-এর প্রতিটি টপিক সম্পর্কে জানতে সাহায্য করবে। ডেটা অ্যানালাইসিসে এই কৌশলগুলি ডেটার গভীরে প্রবেশ করতে এবং লুকানো প্যাটার্ন, সম্পর্ক ও অন্তর্দৃষ্টি আবিষ্কার করতে অত্যন্ত কার্যকর।