局部近似

训练深度神经网络本质上是一个压缩任务。我们希望将训练数据分布表示为由一组矩阵参数化的函数。分布越复杂,所需的参数就越多。近似整个分布的理由是,我们可以在推理时使用相同的模型和权重,对任何有效点进行前向传播。

但是,如果我们的模型在推理时即时训练呢?那么,在传播 $x$ 时,我们只需要对 $x$ 周围的局部分布进行建模。由于局部区域的维度应该比整个训练集低得多,因此一个更简单的模型就足够了!

这就是局部近似或局部回归背后的思想。让我们考虑一个简单的回归任务。

任务

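我们得到了以下数据的 $N = 100$ 个样本:

$$Y = \sin(4X) + \varepsilon$$

其中 $X \sim U[0, 1]$,$\varepsilon \sim \mathcal{N}(0, \sigma^2)$,标准差 $\sigma = 1/3$。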

Loading...
Loading...
绘图代码
from pathlib import Path

import numpy as np
import plotly.graph_objects as go

# Synthetic sample: Y = sin(4X) + Gaussian noise, with a fixed seed so the
# same points are drawn on every run
np.random.seed(42)
n_points = 100
X = np.random.uniform(low=0, high=1, size=n_points)
epsilon = np.random.normal(loc=0, scale=1 / 3, size=n_points)
Y = np.sin(4 * X) + epsilon

# Noise-free curve evaluated on a dense grid
x_true = np.linspace(0, 1, num=500)
y_true = np.sin(4 * x_true)

# Describe both traces first, then build the figure from them
noisy_points = go.Scatter(
    x=X,
    y=Y,
    mode="markers",
    name="Noisy Data",
    marker={"color": "gray"},
)
true_curve = go.Scatter(
    x=x_true,
    y=y_true,
    mode="lines",
    name="True Function",
    line={"color": "red"},
)
fig = go.Figure(data=[noisy_points, true_curve])

# Layout settings that are the same for every theme
fig.update_layout(autosize=True, title="Data", xaxis_title="X", yaxis_title="Y")

# Light and dark palettes (colors aligned with dario.css)
themes = [
    dict(
        name="light",
        template="plotly_white",
        font_color="#141413",
        background="#f0efea",
        axis_color="#141413",
        gridcolor="rgba(20, 20, 19, 0.2)",
    ),
    dict(
        name="dark",
        template="plotly_dark",
        font_color="#f0efea",
        background="#141413",
        axis_color="#f0efea",
        gridcolor="rgba(240, 239, 234, 0.2)",
    ),
]

# The HTML files go into the "static" directory three levels above this file
output_dir = Path(__file__).resolve().parents[3] / "static"
output_dir.mkdir(parents=True, exist_ok=True)

for theme in themes:
    axis_color = theme["axis_color"]
    # The x and y axes receive exactly the same styling
    axis_style = dict(
        showline=True,
        linecolor=axis_color,
        tickcolor=axis_color,
        tickfont=dict(color=axis_color),
        title_font=dict(color=axis_color),
        gridcolor=theme["gridcolor"],
        zeroline=False,
    )

    # Theme a new figure built from the base one rather than `fig` itself
    themed_fig = go.Figure(fig)
    themed_fig.update_layout(
        template=theme["template"],
        font=dict(color=theme["font_color"]),
        paper_bgcolor=theme["background"],
        plot_bgcolor=theme["background"],
    )
    themed_fig.update_xaxes(**axis_style)
    themed_fig.update_yaxes(**axis_style)

    filename = output_dir / f"local_approximation_data_{theme['name']}.html"
    themed_fig.write_html(filename)
    print(f"Saved plot to {filename}")

# Display the unthemed base figure
fig.show()

我们将数据集记为 $\mathcal{D}$,其中包含样本 $(x_i, y_i)$,$i = 1, \dots, N$。

我们的任务是通过数据拟合一条合理的曲线,使其近似匹配真实函数。我们将这条曲线记为 $\hat{f}(x)$。

K近邻算法

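给定某个 $x_0$,一种方法是取 $k$ 个最近值 $x_i$,并将它们的 $y_i$ 值平均作为估计值。即,

$$\hat{f}(x_0) = \frac{1}{k} \sum_{x_i \in N_k(x_0)} y_i$$

其中 $N_k(x_0)$ 表示距离 $x_0$ 最近的 $k$ 个点。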

Loading...
Loading...
绘图代码
from pathlib import Path

import numpy as np
import plotly.graph_objects as go

# Synthetic sample: Y = sin(4X) + Gaussian noise, with a fixed seed
np.random.seed(42)
n_points = 100
X = np.random.uniform(low=0, high=1, size=n_points)
epsilon = np.random.normal(loc=0, scale=1 / 3, size=n_points)
Y = np.sin(4 * X) + epsilon

# Noise-free reference curve
x_true = np.linspace(0, 1, num=500)
y_true = np.sin(4 * x_true)

# Query grid and the values of k offered by the slider
x_curve = np.arange(0, 1, 0.01)
k_range = range(1, 21)

# The ranking of training points by squared distance depends only on the
# query point, so it is computed once per x and sliced for every k.
neighbour_order = [np.argsort(np.square(X - x)) for x in x_curve]

# k-NN estimate: mean of the Y values of the k closest training points
y_curves_knn = {
    k: [np.mean(Y[order[:k]]) for order in neighbour_order] for k in k_range
}

# Slider position shown when the page first loads
initial_k = 13

# Base figure: the data, the true function, and the k-NN curve for initial_k
fig = go.Figure()
for trace in (
    go.Scatter(x=X, y=Y, mode="markers", name="Noisy Data", marker=dict(color="gray")),
    go.Scatter(
        x=x_true, y=y_true, mode="lines", name="True Function", line=dict(color="red")
    ),
    go.Scatter(
        x=x_curve,
        y=y_curves_knn[initial_k],
        mode="lines",
        name="k-NN Curve",
        line=dict(color="yellow"),
    ),
):
    fig.add_trace(trace)

# One slider step per k: each step supplies the y-data for all three traces
# and the matching title
steps = [
    dict(
        method="update",
        args=[
            {"y": [Y, y_true, y_curves_knn[k]]},
            {"title": f"Interactive k-NN Curve with Slider for k = {k}"},
        ],
        label=f"{k}",
    )
    for k in k_range
]

# k_range starts at 1, so the step index of initial_k is initial_k - 1
sliders = [
    dict(
        active=initial_k - 1,
        currentvalue={"prefix": "k = "},
        pad={"t": 50},
        steps=steps,
    )
]

fig.update_layout(
    sliders=sliders,
    autosize=True,
    title=f"Interactive k-NN Curve with Slider for k = {initial_k}",
    xaxis_title="X",
    yaxis_title="Y",
)

# Light and dark palettes used for the exported HTML files
themes = [
    dict(
        name="light",
        template="plotly_white",
        font_color="#141413",
        background="#f0efea",
        axis_color="#141413",
        gridcolor="rgba(20, 20, 19, 0.2)",
    ),
    dict(
        name="dark",
        template="plotly_dark",
        font_color="#f0efea",
        background="#141413",
        axis_color="#f0efea",
        gridcolor="rgba(240, 239, 234, 0.2)",
    ),
]

# The HTML files go into the "static" directory three levels above this file
output_dir = Path(__file__).resolve().parents[3] / "static"
output_dir.mkdir(parents=True, exist_ok=True)

for theme in themes:
    axis_color = theme["axis_color"]
    # The x and y axes receive exactly the same styling
    axis_style = dict(
        showline=True,
        linecolor=axis_color,
        tickcolor=axis_color,
        tickfont=dict(color=axis_color),
        title_font=dict(color=axis_color),
        gridcolor=theme["gridcolor"],
        zeroline=False,
    )

    # Theme a new figure built from the base one rather than `fig` itself
    themed_fig = go.Figure(fig)
    themed_fig.update_layout(
        template=theme["template"],
        font=dict(color=theme["font_color"]),
        paper_bgcolor=theme["background"],
        plot_bgcolor=theme["background"],
    )
    themed_fig.update_xaxes(**axis_style)
    themed_fig.update_yaxes(**axis_style)

    html_path = output_dir / f"knn_slider_{theme['name']}.html"
    themed_fig.write_html(html_path)
    print(f"Saved interactive plot to {html_path}")

# Display the unthemed base figure
fig.show()

通过使用滑块,你可以看到较大的 $k$ 值会导致曲线更平滑,但较小的 $k$ 值曲线会包含一些噪声。在极端情况下,$k = 1$ 会完全追踪训练数据,而 $k = N$ 会给出一个平坦的全局平均值。

纳达拉亚-沃森核回归

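与其将数据子集限制为 $k$ 个点,不如考虑集合中的所有点,但根据每个点与 $x_0$ 的接近程度来加权其贡献。考虑以下模型

$$\hat{f}(x_0) = \frac{\sum_{i=1}^{N} K_\lambda(x_0, x_i)\, y_i}{\sum_{i=1}^{N} K_\lambda(x_0, x_i)}$$

其中 $K_\lambda$ 是一个核函数,我们将用它作为接近度的度量。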

该函数由 $\lambda$ 参数化,称为带宽,它控制数据中哪些范围的 $x$ 值会影响 $\hat{f}(x_0)$ 的输出。如果我们绘制这些函数,这一点就会变得清晰。

核函数

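下图绘制的是

$$K_\lambda(u) = \frac{1}{C_\lambda}\, D\!\left(\frac{u}{\lambda}\right)$$

其中 $D$ 为未缩放的核函数,$C_\lambda$ 使得 $K_\lambda$ 在其支撑集上积分为 $1$。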

Loading...
Loading...
Loading...
Loading...
Loading...
Loading...
绘图代码
from pathlib import Path

import numpy as np
import plotly.graph_objects as go
from scipy.integrate import quad


# Define kernel functions
def epanechnikov_kernel(u):
    """Return the Epanechnikov kernel 0.75 * (1 - u^2), floored at zero."""
    parabola = 0.75 * (1 - u**2)
    # The parabola turns negative for |u| > 1; the kernel is zero there.
    return np.maximum(0, parabola)


def tricube_kernel(u):
    """Return the tricube kernel (1 - |u|^3)^3, floored at zero.

    No normalizing constant is applied here.
    """
    core = 1 - np.abs(u) ** 3
    # For |u| > 1 the cube is negative, so the floor maps it to zero.
    return np.maximum(0, core**3)


def gaussian_kernel(u):
    """Return the standard normal density evaluated at ``u``."""
    normalizer = np.sqrt(2 * np.pi)
    return np.exp(-0.5 * u**2) / normalizer


def renormalized_kernel(kernel_func, u_range, bandwidth):
    """Return ``kernel_func`` rescaled by ``bandwidth`` and normalized on ``u_range``.

    Args:
        kernel_func: Kernel profile, called as ``kernel_func(u / bandwidth)``.
        u_range: Integration limits, unpacked into ``scipy.integrate.quad``.
        bandwidth: Scale factor dividing the kernel argument.

    Returns:
        A callable mapping ``u`` to ``kernel_func(u / bandwidth) / C``, where
        ``C`` is the numerical integral of ``kernel_func(v / bandwidth)``
        over ``u_range``.
    """
    # The normalizing integral depends only on the bandwidth and the range,
    # so it is evaluated once here instead of on every call of the closure.
    normalization_factor, _ = quad(lambda v: kernel_func(v / bandwidth), *u_range)

    def kernel_with_lambda(u):
        return kernel_func(u / bandwidth) / normalization_factor

    return kernel_with_lambda


# Kernel function plot generator
def generate_kernel_plot(
    kernel_name, kernel_func, x_range, u_range, lambda_values, y_range
):
    """Build a figure of one renormalized kernel with a bandwidth slider.

    Args:
        kernel_name: Label used in the title and in the trace names.
        kernel_func: Kernel profile handed to ``renormalized_kernel``.
        x_range: ``(start, stop)`` of the u-axis grid (500 points).
        u_range: Integration range handed to ``renormalized_kernel``.
        lambda_values: Bandwidths; one animation frame and one slider step
            is created for each.
        y_range: Fixed y-axis range for the plot.

    Returns:
        The figure, initially showing the middle entry of ``lambda_values``.
    """
    x = np.linspace(*x_range, 500)
    middle = len(lambda_values) // 2

    def kernel_trace(bandwidth):
        # Line trace of the kernel renormalized for one bandwidth
        kernel_with_lambda = renormalized_kernel(kernel_func, u_range, bandwidth)
        return go.Scatter(
            x=x,
            y=kernel_with_lambda(x),
            mode="lines",
            name=f"{kernel_name} Kernel (λ={bandwidth:.2f})",
            line=dict(color="green"),
        )

    # The initial trace shows the middle bandwidth
    fig = go.Figure()
    fig.add_trace(kernel_trace(lambda_values[middle]))

    # One frame per bandwidth, named by its two-decimal value
    fig.frames = [
        go.Frame(data=[kernel_trace(bandwidth)], name=f"{bandwidth:.2f}")
        for bandwidth in lambda_values
    ]

    # Each slider step animates to the frame carrying the same name
    slider_steps = []
    for bandwidth in lambda_values:
        frame_name = f"{bandwidth:.2f}"
        slider_steps.append(
            {
                "args": [
                    [frame_name],
                    {"frame": {"duration": 0, "redraw": True}, "mode": "immediate"},
                ],
                "label": frame_name,
                "method": "animate",
            }
        )
    sliders = [
        {
            "active": middle,
            "currentvalue": {"prefix": "Bandwidth λ: "},
            "steps": slider_steps,
        }
    ]

    # The button menu is kept but defines no buttons, so the slider is the
    # only control on this figure.
    button_menu = {
        "buttons": [],
        "direction": "left",
        "pad": {"r": 10, "t": 87},
        "showactive": False,
        "type": "buttons",
        "x": 0.1,
        "xanchor": "right",
        "y": 0,
        "yanchor": "top",
    }

    fig.update_layout(
        title=f"{kernel_name} Kernel Function",
        xaxis_title="u",
        yaxis_title="K(u)",
        yaxis_range=y_range,
        sliders=sliders,
        autosize=True,
        updatemenus=[button_menu],
    )

    return fig


# Kernels to plot, keyed by display name
kernels = dict(
    Epanechnikov=epanechnikov_kernel,
    Tricube=tricube_kernel,
    Gaussian=gaussian_kernel,
)

# Plot parameters
x_range_plot = (-3, 3)  # u values covered by the plot
u_range_integration = (-3, 3)  # range the kernels are normalized over
lambda_values = np.linspace(0.01, 2, 20)  # 20 evenly spaced bandwidths in [0.01, 2]
y_range_plot = (0, 1.5)  # fixed y-axis range

# Light and dark palettes used for the exported HTML files
themes = [
    dict(
        name="light",
        template="plotly_white",
        font_color="#141413",
        background="#f0efea",
        axis_color="#141413",
        gridcolor="rgba(20, 20, 19, 0.2)",
    ),
    dict(
        name="dark",
        template="plotly_dark",
        font_color="#f0efea",
        background="#141413",
        axis_color="#f0efea",
        gridcolor="rgba(240, 239, 234, 0.2)",
    ),
]

# The HTML files go into the "static" directory three levels above this file
output_dir = Path(__file__).resolve().parents[3] / "static"
output_dir.mkdir(parents=True, exist_ok=True)

for kernel_name, kernel_func in kernels.items():
    fig = generate_kernel_plot(
        kernel_name=kernel_name,
        kernel_func=kernel_func,
        x_range=x_range_plot,
        u_range=u_range_integration,
        lambda_values=lambda_values,
        y_range=y_range_plot,
    )

    # Write one themed HTML file per palette
    for theme in themes:
        axis_color = theme["axis_color"]
        # The x and y axes receive exactly the same styling
        axis_style = dict(
            showline=True,
            linecolor=axis_color,
            tickcolor=axis_color,
            tickfont=dict(color=axis_color),
            title_font=dict(color=axis_color),
            gridcolor=theme["gridcolor"],
            zeroline=False,
        )

        themed_fig = go.Figure(fig)
        themed_fig.update_layout(
            template=theme["template"],
            font=dict(color=theme["font_color"]),
            paper_bgcolor=theme["background"],
            plot_bgcolor=theme["background"],
        )
        themed_fig.update_xaxes(**axis_style)
        themed_fig.update_yaxes(**axis_style)

        html_name = (
            f"{kernel_name}_dynamic_normalization_kernel_function_{theme['name']}.html"
        )
        filename = output_dir / html_name
        themed_fig.write_html(filename, auto_play=False)
        print(f"Saved {kernel_name} kernel plot to {filename}")

    # Display the unthemed figure for this kernel
    fig.show()

结果

我们现在绘制每个核函数的结果。每个图都有一个 $\lambda$ 滑块,可以实时控制输出。

Loading...
Loading...
Loading...
Loading...
Loading...
Loading...
绘图代码
from pathlib import Path

import numpy as np
import plotly.graph_objects as go


# Define kernel functions
def epanechnikov_kernel(u):
    """Return the Epanechnikov kernel 0.75 * (1 - u^2), floored at zero."""
    parabola = 0.75 * (1 - u**2)
    # The parabola turns negative for |u| > 1; the kernel is zero there.
    return np.maximum(0, parabola)


def tricube_kernel(u):
    """Return the tricube kernel (1 - |u|^3)^3, floored at zero.

    No normalizing constant is applied here.
    """
    core = 1 - np.abs(u) ** 3
    # For |u| > 1 the cube is negative, so the floor maps it to zero.
    return np.maximum(0, core**3)


def gaussian_kernel(u):
    """Return the standard normal density evaluated at ``u``."""
    normalizer = np.sqrt(2 * np.pi)
    return np.exp(-0.5 * u**2) / normalizer


# Kernel regression function
def kernel_regression(X, Y, x_curve, kernel_func, bandwidth):
    """Return the Nadaraya-Watson estimate at every point of ``x_curve``.

    Args:
        X: Training inputs (array supporting ``X - x``).
        Y: Training targets aligned with ``X``.
        x_curve: Iterable of query points.
        kernel_func: Kernel applied to ``|X - x| / bandwidth``.
        bandwidth: Scale dividing the distances.

    Returns:
        A list with one kernel-weighted average of ``Y`` per query point;
        the entry is ``0`` when the weights do not sum to a positive value.
    """

    def estimate(x):
        weights = kernel_func(np.abs(X - x) / bandwidth)
        total_weight = np.sum(weights)
        if total_weight > 0:
            return np.sum(weights * Y) / total_weight
        # No training point carries weight at this query point
        return 0

    return [estimate(x) for x in x_curve]


# Synthetic sample: Y = sin(4X) + Gaussian noise, with a fixed seed
np.random.seed(42)
n_points = 100
X = np.random.uniform(low=0, high=1, size=n_points)
epsilon = np.random.normal(loc=0, scale=1 / 3, size=n_points)
Y = np.sin(4 * X) + epsilon

# Noise-free reference curve
x_true = np.linspace(0, 1, num=500)
y_true = np.sin(4 * x_true)

# The regression is evaluated on the same grid as the true curve
x_curve = x_true

# Kernels to compare, keyed by display name
kernels = dict(
    Epanechnikov=epanechnikov_kernel,
    Tricube=tricube_kernel,
    Gaussian=gaussian_kernel,
)

# 20 log-spaced bandwidths from 0.01 to 1 for the slider
lambda_values = np.logspace(-2, 0, 20)

# Light and dark palettes used for the exported HTML files
themes = [
    dict(
        name="light",
        template="plotly_white",
        font_color="#141413",
        background="#f0efea",
        axis_color="#141413",
        gridcolor="rgba(20, 20, 19, 0.2)",
    ),
    dict(
        name="dark",
        template="plotly_dark",
        font_color="#f0efea",
        background="#141413",
        axis_color="#f0efea",
        gridcolor="rgba(240, 239, 234, 0.2)",
    ),
]

# The HTML files go into the "static" directory three levels above this file
output_dir = Path(__file__).resolve().parents[3] / "static"
output_dir.mkdir(parents=True, exist_ok=True)

# Generate separate plots for each kernel
def _regression_traces(kernel_name, y_curve):
    """Return the data, true-function and regression traces for one curve."""
    return [
        go.Scatter(
            x=X, y=Y, mode="markers", name="Noisy Data", marker=dict(color="gray")
        ),
        go.Scatter(
            x=x_true,
            y=y_true,
            mode="lines",
            name="True Function",
            line=dict(color="red"),
        ),
        go.Scatter(
            x=x_curve,
            y=y_curve,
            mode="lines",
            name=f"Nadaraya-Watson ({kernel_name})",
            line=dict(color="green"),
        ),
    ]


# Slider steps select a frame by its name, so frame names must be unique.
# Two-decimal bandwidth strings collide on the log-spaced grid (0.0100 and
# 0.0127 both format as "0.01"), which makes several steps address the same
# frame. Frames are therefore named by index, and the visible labels use
# three decimals so neighbouring steps read differently.
frame_names = [str(index) for index in range(len(lambda_values))]
slider_labels = [f"{bandwidth:.3f}" for bandwidth in lambda_values]

for kernel_name, kernel_func in kernels.items():
    # Fit once per bandwidth; the first curve doubles as the initial trace
    curves = [
        kernel_regression(X, Y, x_curve, kernel_func, bandwidth)
        for bandwidth in lambda_values
    ]

    # Initial view: data, true function and the smallest-bandwidth fit
    fig = go.Figure()
    for trace in _regression_traces(kernel_name, curves[0]):
        fig.add_trace(trace)

    # One frame per bandwidth, each carrying all three traces
    fig.frames = [
        go.Frame(data=_regression_traces(kernel_name, y_curve), name=frame_name)
        for frame_name, y_curve in zip(frame_names, curves)
    ]

    # Each slider step jumps straight to the frame with the matching name
    sliders = [
        {
            "active": 0,
            "currentvalue": {"prefix": "Bandwidth λ: "},
            "steps": [
                {
                    "args": [
                        [frame_name],
                        {"frame": {"duration": 0, "redraw": True}, "mode": "immediate"},
                    ],
                    "label": label,
                    "method": "animate",
                }
                for frame_name, label in zip(frame_names, slider_labels)
            ],
        }
    ]

    # Play steps through the frames from the current one; Pause animates to
    # [None] in immediate mode.
    play_button = {
        "args": [
            None,
            {
                "frame": {"duration": 500, "redraw": True},
                "fromcurrent": True,
            },
        ],
        "label": "Play",
        "method": "animate",
    }
    pause_button = {
        "args": [
            [None],
            {
                "frame": {"duration": 0, "redraw": True},
                "mode": "immediate",
            },
        ],
        "label": "Pause",
        "method": "animate",
    }

    fig.update_layout(
        autosize=True,
        title=f"Nadaraya-Watson Kernel Regression ({kernel_name} Kernel)",
        xaxis_title="X",
        yaxis_title="Y",
        sliders=sliders,
        updatemenus=[
            {
                "buttons": [play_button, pause_button],
                "direction": "left",
                "pad": {"r": 10, "t": 87},
                "showactive": False,
                "type": "buttons",
                "x": 0.1,
                "xanchor": "right",
                "y": 0,
                "yanchor": "top",
            }
        ],
    )

    # Save the figure to an HTML file per theme
    for theme in themes:
        axis_color = theme["axis_color"]
        # The x and y axes receive exactly the same styling
        axis_style = dict(
            showline=True,
            linecolor=axis_color,
            tickcolor=axis_color,
            tickfont=dict(color=axis_color),
            title_font=dict(color=axis_color),
            gridcolor=theme["gridcolor"],
            zeroline=False,
        )

        themed_fig = go.Figure(fig)
        themed_fig.update_layout(
            template=theme["template"],
            font=dict(color=theme["font_color"]),
            paper_bgcolor=theme["background"],
            plot_bgcolor=theme["background"],
        )
        themed_fig.update_xaxes(**axis_style)
        themed_fig.update_yaxes(**axis_style)

        filename = output_dir / f"{kernel_name}_kernel_regression_{theme['name']}.html"
        themed_fig.write_html(filename, auto_play=False)
        print(f"Saved {kernel_name} kernel plot to {filename}")

    # Show the figure
    fig.show()

我们可以看到,数据的简单加权平均值能够很好地模拟正弦曲线。

局部线性回归

在 Nadaraya-Watson 核回归中,我们在核函数 $K_\lambda$ 定义的邻域内进行加权平均。这种方法的一个潜在问题是局部邻域内的平滑插值,因为我们实际上并没有假设该区域遵循任何模型。

如果我们假设每个区域都是局部线性的呢?那么,我们可以求解最小二乘拟合并自由插值!

区域:$k$-近邻

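让我们将局部区域定义为输入点 $x_0$ 的 $k$ 个最近邻,并将由这些点构成的设计矩阵(第一列全为 $1$)记为 $\mathbf{X}_k$。设 $\mathbf{y}_k$ 为对应的 $y$ 值。最小二乘拟合系数为

$$\hat{\beta} = (\mathbf{X}_k^\top \mathbf{X}_k)^{-1} \mathbf{X}_k^\top \mathbf{y}_k$$

预测值为 $\hat{f}(x_0) = \hat{\beta}_0 + \hat{\beta}_1 x_0$。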

Loading...
Loading...
绘图代码
from pathlib import Path

import numpy as np
import plotly.graph_objects as go

# Synthetic sample: Y = sin(4X) + Gaussian noise, with a fixed seed so the
# same points are drawn on every run
np.random.seed(42)
n_points = 100
X = np.random.uniform(low=0, high=1, size=n_points)
epsilon = np.random.normal(loc=0, scale=1 / 3, size=n_points)
Y = np.sin(4 * X) + epsilon

# Noise-free reference curve on a dense grid
x_true = np.linspace(0, 1, num=500)
y_true = np.sin(4 * x_true)


# k-NN Local Linear Regression
def knn_linear_regression(X, Y, x_curve, k_range):
    """Fit a straight line to the k nearest neighbours of each query point.

    Args:
        X: Training inputs (1-D array).
        Y: Training targets aligned with ``X``.
        x_curve: Iterable of query points.
        k_range: Iterable of neighbourhood sizes.

    Returns:
        A dict mapping each ``k`` to the list of predictions over ``x_curve``.
    """

    def predict(x, k):
        # Indices of the k training points closest to x
        neighbours = np.argsort(np.abs(X - x))[:k]
        X_knn, Y_knn = X[neighbours], Y[neighbours]

        # Design matrix with an intercept column followed by the inputs
        X_design = np.vstack((np.ones_like(X_knn), X_knn)).T

        # Least-squares coefficients via the pseudo-inverse of X^T X
        beta = np.linalg.pinv(X_design.T @ X_design) @ X_design.T @ Y_knn
        return beta[0] + beta[1] * x

    return {k: [predict(x, k) for x in x_curve] for k in k_range}


# Query grid, slider values and the default slider position
x_curve = np.arange(0, 1, 0.01)
k_range = range(1, 21)  # k from 1 to 20
initial_k = 10

# Local linear fits for every k
y_curves_knn = knn_linear_regression(X, Y, x_curve, k_range)

# Base figure: the data, the true function, and the fit for initial_k
fig = go.Figure()
for trace in (
    go.Scatter(x=X, y=Y, mode="markers", name="Noisy Data", marker=dict(color="gray")),
    go.Scatter(
        x=x_true, y=y_true, mode="lines", name="True Function", line=dict(color="red")
    ),
    go.Scatter(
        x=x_curve,
        y=y_curves_knn[initial_k],
        mode="lines",
        name="k-NN Curve",
        line=dict(color="yellow"),
    ),
):
    fig.add_trace(trace)

# One slider step per k: each step supplies the y-data for all three traces
# and the matching title
steps = [
    dict(
        method="update",
        args=[
            {"y": [Y, y_true, y_curves_knn[k]]},
            {"title": f"k-NN Local Linear Regression Curve with k = {k}"},
        ],
        label=f"{k}",
    )
    for k in k_range
]

# The active step is the position of initial_k within k_range
sliders = [
    dict(
        active=k_range.index(initial_k),
        currentvalue={"prefix": "k = "},
        pad={"t": 50},
        steps=steps,
    )
]

fig.update_layout(
    autosize=True,
    sliders=sliders,
    title=f"k-NN Local Linear Regression Curve with k = {initial_k}",
    xaxis_title="X",
    yaxis_title="Y",
)

# Light and dark palettes used for the exported HTML files
themes = [
    dict(
        name="light",
        template="plotly_white",
        font_color="#141413",
        background="#f0efea",
        axis_color="#141413",
        gridcolor="rgba(20, 20, 19, 0.2)",
    ),
    dict(
        name="dark",
        template="plotly_dark",
        font_color="#f0efea",
        background="#141413",
        axis_color="#f0efea",
        gridcolor="rgba(240, 239, 234, 0.2)",
    ),
]

# The HTML files go into the "static" directory three levels above this file
output_dir = Path(__file__).resolve().parents[3] / "static"
output_dir.mkdir(parents=True, exist_ok=True)

for theme in themes:
    axis_color = theme["axis_color"]
    # The x and y axes receive exactly the same styling
    axis_style = dict(
        showline=True,
        linecolor=axis_color,
        tickcolor=axis_color,
        tickfont=dict(color=axis_color),
        title_font=dict(color=axis_color),
        gridcolor=theme["gridcolor"],
        zeroline=False,
    )

    # Theme a new figure built from the base one rather than `fig` itself
    themed_fig = go.Figure(fig)
    themed_fig.update_layout(
        template=theme["template"],
        font=dict(color=theme["font_color"]),
        paper_bgcolor=theme["background"],
        plot_bgcolor=theme["background"],
    )
    themed_fig.update_xaxes(**axis_style)
    themed_fig.update_yaxes(**axis_style)

    html_path = output_dir / f"knn_slider_llr_{theme['name']}.html"
    themed_fig.write_html(html_path)
    print(f"Saved interactive k-NN plot to {html_path}")

# Display the unthemed base figure
fig.show()

我们可以看到,当 $k$ 较小时,输出可能会显得相当粗糙。

区域:核函数

或许我们可以借鉴 Nadaraya-Watson 核函数的一些思想。我们希望不同程度地考虑训练集中的所有点,局部区域内的点赋予较高权重,区域外的点赋予较低权重。

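为此,我们可以使用加权最小二乘目标函数,权重为 $K_\lambda(x_0, x_i)$。其解为

$$\hat{\beta} = (\mathbf{X}^\top \mathbf{W} \mathbf{X})^{-1} \mathbf{X}^\top \mathbf{W} \mathbf{y}, \qquad \mathbf{W} = \operatorname{diag}\big(K_\lambda(x_0, x_i)\big)$$

绘制不同核函数 $K$ 的结果: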

Loading...
Loading...
Loading...
Loading...
Loading...
Loading...
绘图代码
from pathlib import Path

import numpy as np
import plotly.graph_objects as go

# Synthetic sample: Y = sin(4X) + Gaussian noise, with a fixed seed so the
# same points are drawn on every run
np.random.seed(42)
n_points = 100
X = np.random.uniform(low=0, high=1, size=n_points)
epsilon = np.random.normal(loc=0, scale=1 / 3, size=n_points)
Y = np.sin(4 * X) + epsilon

# Noise-free reference curve on a dense grid
x_true = np.linspace(0, 1, num=500)
y_true = np.sin(4 * x_true)


# Kernels
def gaussian_kernel(u):
    """Return the Gaussian weight exp(-u^2 / 2), with no normalizing constant."""
    exponent = -0.5 * u**2
    return np.exp(exponent)


def epanechnikov_kernel(u):
    """Return the Epanechnikov weight 1 - u^2, floored at zero (unscaled)."""
    parabola = 1 - u**2
    # Points with |u| > 1 get zero weight.
    return np.maximum(0, parabola)


def tricube_kernel(u):
    """Return the tricube weight (1 - |u|^3)^3, floored at zero."""
    core = 1 - np.abs(u) ** 3
    # For |u| > 1 the cube is negative, so the floor maps it to zero.
    return np.maximum(0, core**3)


# Local Linear Regression for a specific kernel
def local_linear_regression(X, Y, x_curve, bandwidths, kernel):
    """Fit a kernel-weighted straight line around each query point.

    Args:
        X: Training inputs (1-D array).
        Y: Training targets aligned with ``X``.
        x_curve: Iterable of query points.
        bandwidths: Iterable of bandwidths; each one yields one curve.
        kernel: Weight function applied to ``(X - x) / bandwidth``.

    Returns:
        A dict mapping each bandwidth, rounded to two decimals, to the list
        of predictions over ``x_curve``. Bandwidths that round to the same
        value share a key, and the later one overwrites the earlier.
    """
    # The design matrix [1, x_i] is the same for every query point and
    # bandwidth, so it is built once.
    X_design = np.vstack((np.ones_like(X), X)).T

    y_curves = {}
    for bandwidth in bandwidths:
        y_curve = []
        for x in x_curve:
            weights = kernel((X - x) / bandwidth)

            # X^T W with W = diag(weights), formed by broadcasting so the
            # dense n-by-n diagonal matrix is never materialized.
            weighted_design_t = X_design.T * weights

            # Weighted least squares: beta = pinv(X^T W X) X^T W Y
            beta = np.linalg.pinv(weighted_design_t @ X_design) @ weighted_design_t @ Y

            y_curve.append(beta[0] + beta[1] * x)
        y_curves[round(bandwidth, 2)] = y_curve
    return y_curves


# Query grid and slider bandwidths; the middle bandwidth is shown first
x_curve = np.arange(0, 1, 0.01)
bandwidths = np.linspace(0.05, 0.5, 20)
middle_index = len(bandwidths) // 2
initial_λ = bandwidths[middle_index]

# Kernels to compare, keyed by display name
kernels = {
    "Gaussian Kernel": gaussian_kernel,
    "Epanechnikov Kernel": epanechnikov_kernel,
    "Tricube Kernel": tricube_kernel,
}
plots = []

for kernel_name, kernel_func in kernels.items():
    # Curves are keyed by bandwidth rounded to two decimals
    y_curves = local_linear_regression(X, Y, x_curve, bandwidths, kernel_func)

    # Base figure: the data, the true function, and the initial fit
    fig = go.Figure()
    for trace in (
        go.Scatter(
            x=X, y=Y, mode="markers", name="Noisy Data", marker=dict(color="gray")
        ),
        go.Scatter(
            x=x_true,
            y=y_true,
            mode="lines",
            name="True Function",
            line=dict(color="red"),
        ),
        go.Scatter(
            x=x_curve,
            y=y_curves[round(initial_λ, 2)],
            mode="lines",
            name=f"{kernel_name} Curve",
            line=dict(color="yellow"),
        ),
    ):
        fig.add_trace(trace)

    # One slider step per bandwidth: each step supplies the y-data for all
    # three traces and the matching title
    steps = []
    for bandwidth in bandwidths:
        shown = round(bandwidth, 2)
        steps.append(
            dict(
                method="update",
                args=[
                    {"y": [Y, y_true, y_curves[shown]]},
                    {"title": f"LLR: {kernel_name} with Bandwidth λ = {shown}"},
                ],
                label=f"{shown}",
            )
        )

    # The slider starts on the middle bandwidth
    sliders = [
        dict(
            active=middle_index,
            currentvalue={"prefix": "λ = "},
            pad={"t": 50},
            steps=steps,
        )
    ]

    fig.update_layout(
        autosize=True,
        sliders=sliders,
        title=f"LLR: {kernel_name} with Bandwidth λ = {round(initial_λ, 2)}",
        xaxis_title="X",
        yaxis_title="Y",
    )

    plots.append(fig)

# Light and dark palettes used for the exported HTML files
themes = [
    dict(
        name="light",
        template="plotly_white",
        font_color="#141413",
        background="#f0efea",
        axis_color="#141413",
        gridcolor="rgba(20, 20, 19, 0.2)",
    ),
    dict(
        name="dark",
        template="plotly_dark",
        font_color="#f0efea",
        background="#141413",
        axis_color="#f0efea",
        gridcolor="rgba(240, 239, 234, 0.2)",
    ),
]

# The HTML files go into the "static" directory three levels above this file
output_dir = Path(__file__).resolve().parents[3] / "static"
output_dir.mkdir(parents=True, exist_ok=True)

# `plots` holds one figure per kernel, in the same order as `kernels`
for kernel_name, fig in zip(kernels.keys(), plots):
    # Display the unthemed figure before writing the themed files
    fig.show()

    # File-name stem: lower-cased kernel name with spaces as underscores
    kernel_slug = kernel_name.lower().replace(" ", "_")

    for theme in themes:
        axis_color = theme["axis_color"]
        # The x and y axes receive exactly the same styling
        axis_style = dict(
            showline=True,
            linecolor=axis_color,
            tickcolor=axis_color,
            tickfont=dict(color=axis_color),
            title_font=dict(color=axis_color),
            gridcolor=theme["gridcolor"],
            zeroline=False,
        )

        themed_fig = go.Figure(fig)
        themed_fig.update_layout(
            template=theme["template"],
            font=dict(color=theme["font_color"]),
            paper_bgcolor=theme["background"],
            plot_bgcolor=theme["background"],
        )
        themed_fig.update_xaxes(**axis_style)
        themed_fig.update_yaxes(**axis_style)

        filename = output_dir / f"llr_{kernel_slug}_{theme['name']}.html"
        themed_fig.write_html(filename)
        print(f"Saved interactive plot for {kernel_name} to {filename}")

我觉得结果看起来平滑多了!

参考文献

  • 统计学习基础 - Hastie, Tibshirani, 和 Friedman (2009). 一本关于数据挖掘、推断和预测的全面指南。了解更多.