Module pipeline_dp.aggregate_params
Contains utility classes used for specifying DP aggregation parameters, noise types, and norms.
Expand source code
"""Contains utility classes used for specifying DP aggregation parameters, noise types, and norms."""
from dataclasses import dataclass
from enum import Enum
from typing import Any, Iterable, Callable, Union
class Metrics(Enum):
    """Metrics that can be listed in AggregateParams.metrics.

    Each member's value is the lowercase string name of the metric.
    """

    COUNT = "count"
    PRIVACY_ID_COUNT = "privacy_id_count"
    SUM = "sum"
    MEAN = "mean"
    VAR = "variance"
class NoiseKind(Enum):
    """Kinds of noise that can be requested for the DP calculations."""

    LAPLACE = 'laplace'
    GAUSSIAN = 'gaussian'

    def convert_to_mechanism_type(self):
        """Returns the MechanismType member that corresponds to this noise kind.

        Returns:
            MechanismType.LAPLACE for NoiseKind.LAPLACE and
            MechanismType.GAUSSIAN for NoiseKind.GAUSSIAN.

        Raises:
            ValueError: if this member has no corresponding MechanismType.
        """
        # Enum members are singletons, so identity is the canonical
        # comparison; there is no need to compare the underlying strings.
        if self is NoiseKind.LAPLACE:
            return MechanismType.LAPLACE
        if self is NoiseKind.GAUSSIAN:
            return MechanismType.GAUSSIAN
        # Unreachable with the current members. Guards against a member added
        # later silently converting to None.
        raise ValueError(
            f"NoiseKind {self} has no corresponding MechanismType")


class MechanismType(Enum):
    """Types of mechanisms.

    NoiseKind.convert_to_mechanism_type() maps onto LAPLACE and GAUSSIAN; no
    NoiseKind member maps onto GENERIC.
    """

    LAPLACE = 'Laplace'
    GAUSSIAN = 'Gaussian'
    GENERIC = 'Truncated Geometric'
class NormKind(Enum):
    """Kinds of norms: L-infinity, L0, L1 and L2."""

    Linf = 'linf'
    L0 = 'l0'
    L1 = 'l1'
    L2 = 'l2'
@dataclass
class AggregateParams:
    """Parameters for the function DPEngine.aggregate().

    Args:
        noise_kind: Kind of noise to use for the DP calculations.
        metrics: Metrics to compute.
        max_partitions_contributed: Bounds the number of partitions in which
            one unit of privacy (e.g., a user) can participate.
        max_contributions_per_partition: Bounds the number of times one unit
            of privacy (e.g., a user) can contribute to a partition.
        budget_weight: Relative weight of the privacy budget allocated for
            this aggregation.
        low: Deprecated, use min_value. Any value other than None is rejected.
        high: Deprecated, use max_value. Any value other than None is
            rejected.
        min_value: Lower bound on a value contributed by a unit of privacy in
            a partition.
        max_value: Upper bound on a value contributed by a unit of privacy in
            a partition.
        public_partitions: A collection of partition keys that will be
            present in the result.

    Raises:
        ValueError: if the deprecated `low` or `high` argument is supplied.
    """

    noise_kind: NoiseKind
    metrics: Iterable[Metrics]
    max_partitions_contributed: int
    max_contributions_per_partition: int
    budget_weight: float = 1
    low: float = None  # deprecated
    high: float = None  # deprecated
    min_value: float = None
    max_value: float = None
    public_partitions: Any = None

    def __post_init__(self):
        """Rejects the deprecated `low` / `high` arguments."""
        # Checked in this order so that `low` is reported first when both
        # deprecated arguments are given.
        deprecated_checks = (
            (self.low, "AggregateParams: please use min_value instead of low"),
            (self.high,
             "AggregateParams: please use max_value instead of high"),
        )
        for supplied, message in deprecated_checks:
            if supplied is not None:
                raise ValueError(message)

    def __str__(self):
        """Returns a short description listing the values of the metrics."""
        metric_values = [metric.value for metric in self.metrics]
        return f"Metrics: {metric_values}"
# TODO: Think of whether this class should be used for both low-level API
# (dp_engine) and high-level API (private_spark, private_beam, etc.).
@dataclass
class SelectPrivatePartitionsParams:
    """Parameters for differentially-private partition selection.

    Args:
        max_partitions_contributed: Maximum number of partitions per privacy
            ID. Contributions over this limit are dropped by the algorithm.
            To keep more data, this should be a good estimate of the
            realistic upper bound. Significantly over- or underestimating it
            may increase the amount of dropped partitions. You can experiment
            with different values to select which one retains more
            partitions.
    """

    max_partitions_contributed: int

    def __str__(self):
        """Returns a fixed human-readable label for these parameters."""
        label = "Private Partitions"
        return label
@dataclass
class SumParams:
    """Parameters for a differentially-private sum calculation.

    Args:
        max_partitions_contributed: Bounds the number of partitions in which
            one unit of privacy (e.g., a user) can participate.
        max_contributions_per_partition: Bounds the number of times one unit
            of privacy (e.g., a user) can contribute to a partition.
        min_value: Lower bound on a value contributed by a unit of privacy in
            a partition.
        max_value: Upper bound on a value contributed by a unit of privacy in
            a partition.
        partition_extractor: A function for partition id extraction from a
            collection record.
        value_extractor: A function for extraction of the value for which the
            sum will be calculated.
        low: Deprecated, use min_value. Any value other than None is rejected.
        high: Deprecated, use max_value. Any value other than None is
            rejected.
        budget_weight: Relative weight of the privacy budget allocated for
            this operation.
        noise_kind: Kind of noise to use for the DP calculations.
        public_partitions: A collection of partition keys that will be
            present in the result.

    Raises:
        ValueError: if the deprecated `low` or `high` argument is supplied.
    """

    max_partitions_contributed: int
    max_contributions_per_partition: int
    min_value: float
    max_value: float
    partition_extractor: Callable
    value_extractor: Callable
    low: float = None  # deprecated
    high: float = None  # deprecated
    budget_weight: float = 1
    noise_kind: NoiseKind = NoiseKind.LAPLACE
    public_partitions: Union[Iterable, 'PCollection', 'RDD'] = None

    def __post_init__(self):
        """Rejects the deprecated `low` / `high` arguments."""
        # Checked in this order so that `low` is reported first when both
        # deprecated arguments are given.
        deprecated_checks = (
            (self.low, "SumParams: please use min_value instead of low"),
            (self.high, "SumParams: please use max_value instead of high"),
        )
        for supplied, message in deprecated_checks:
            if supplied is not None:
                raise ValueError(message)
@dataclass
class CountParams:
    """Parameters for a differentially-private count calculation.

    Args:
        noise_kind: Kind of noise to use for the DP calculations.
        max_partitions_contributed: Bounds the number of partitions in which
            one unit of privacy (e.g., a user) can participate.
        max_contributions_per_partition: Bounds the number of times one unit
            of privacy (e.g., a user) can contribute to a partition.
        partition_extractor: A function for partition id extraction from a
            collection record.
        budget_weight: Relative weight of the privacy budget allocated for
            this operation.
        public_partitions: A collection of partition keys that will be
            present in the result.
    """

    # Required arguments.
    noise_kind: NoiseKind
    max_partitions_contributed: int
    max_contributions_per_partition: int
    partition_extractor: Callable

    # Optional arguments.
    budget_weight: float = 1
    public_partitions: Union[Iterable, 'PCollection', 'RDD'] = None
@dataclass
class PrivacyIdCountParams:
    """Parameters for a differentially-private privacy id count calculation.

    Args:
        noise_kind: Kind of noise to use for the DP calculations.
        max_partitions_contributed: Bounds the number of partitions in which
            one unit of privacy (e.g., a user) can participate.
        partition_extractor: A function for partition id extraction from a
            collection record.
        budget_weight: Relative weight of the privacy budget allocated for
            this operation.
        public_partitions: A collection of partition keys that will be
            present in the result.
    """

    # Required arguments.
    noise_kind: NoiseKind
    max_partitions_contributed: int
    partition_extractor: Callable

    # Optional arguments.
    budget_weight: float = 1
    public_partitions: Union[Iterable, 'PCollection', 'RDD'] = None
Classes
class AggregateParams (noise_kind: NoiseKind, metrics: Iterable[Metrics], max_partitions_contributed: int, max_contributions_per_partition: int, budget_weight: float = 1, low: float = None, high: float = None, min_value: float = None, max_value: float = None, public_partitions: Any = None)

Specifies parameters for function DPEngine.aggregate()
Args
noise_kind
 Kind of noise to use for the DP calculations.
metrics
 Metrics to compute.
max_partitions_contributed
 Bounds the number of partitions in which one unit of privacy (e.g., a user) can participate.
max_contributions_per_partition
 Bounds the number of times one unit of privacy (e.g. a user) can contribute to a partition.
min_value
 Lower bound on a value contributed by a unit of privacy in a partition.
max_value
 Upper bound on a value contributed by a unit of privacy in a partition.
public_partitions
 a collection of partition keys that will be present in the result.
Expand source code
@dataclass class AggregateParams: """Specifies parameters for function DPEngine.aggregate() Args: noise_kind: Kind of noise to use for the DP calculations. metrics: Metrics to compute. max_partitions_contributed: Bounds the number of partitions in which one unit of privacy (e.g., a user) can participate. max_contributions_per_partition: Bounds the number of times one unit of privacy (e.g. a user) can contribute to a partition. min_value: Lower bound on a value contributed by a unit of privacy in a partition. max_value: Upper bound on a value contributed by a unit of privacy in a partition. public_partitions: a collection of partition keys that will be present in the result. """ noise_kind: NoiseKind metrics: Iterable[Metrics] max_partitions_contributed: int max_contributions_per_partition: int budget_weight: float = 1 low: float = None # deprecated high: float = None # deprecated min_value: float = None max_value: float = None public_partitions: Any = None def __post_init__(self): if self.low is not None: raise ValueError( "AggregateParams: please use min_value instead of low") if self.high is not None: raise ValueError( "AggregateParams: please use max_value instead of high") def __str__(self): return f"Metrics: {[m.value for m in self.metrics]}"
Class variables
var budget_weight : float
var high : float
var low : float
var max_contributions_per_partition : int
var max_partitions_contributed : int
var max_value : float
var metrics : Iterable[Metrics]
var min_value : float
var noise_kind : NoiseKind
var public_partitions : Any
class CountParams (noise_kind: NoiseKind, max_partitions_contributed: int, max_contributions_per_partition: int, partition_extractor: Callable, budget_weight: float = 1, public_partitions: Union[Iterable, ForwardRef('PCollection'), ForwardRef('RDD')] = None)

Specifies parameters for differentially-private count calculation.
Args
noise_kind
 Kind of noise to use for the DP calculations.
max_partitions_contributed
 Bounds the number of partitions in which one unit of privacy (e.g., a user) can participate.
max_contributions_per_partition
 Bounds the number of times one unit of privacy (e.g. a user) can contribute to a partition.
partition_extractor
 A function for partition id extraction from a collection record.
budget_weight
 Relative weight of the privacy budget allocated for this operation.
public_partitions
 A collection of partition keys that will be present in the result.
Expand source code
@dataclass class CountParams: """Specifies parameters for differentiallyprivate count calculation. Args: noise_kind: Kind of noise to use for the DP calculations. max_partitions_contributed: Bounds the number of partitions in which one unit of privacy (e.g., a user) can participate. max_contributions_per_partition: Bounds the number of times one unit of privacy (e.g. a user) can contribute to a partition. partition_extractor: A function for partition id extraction from a collection record. budget_weight: Relative weight of the privacy budget allocated for this operation. public_partitions: A collection of partition keys that will be present in the result. """ noise_kind: NoiseKind max_partitions_contributed: int max_contributions_per_partition: int partition_extractor: Callable budget_weight: float = 1 public_partitions: Union[Iterable, 'PCollection', 'RDD'] = None
Class variables
var budget_weight : float
var max_contributions_per_partition : int
var max_partitions_contributed : int
var noise_kind : NoiseKind
var partition_extractor : Callable
var public_partitions : Union[Iterable, PCollection, RDD]
class MechanismType (value, names=None, *, module=None, qualname=None, type=None, start=1)

An enumeration.
Expand source code
class MechanismType(Enum): LAPLACE = 'Laplace' GAUSSIAN = 'Gaussian' GENERIC = 'Truncated Geometric'
Ancestors
 enum.Enum
Class variables
var GAUSSIAN
var GENERIC
var LAPLACE
class Metrics (value, names=None, *, module=None, qualname=None, type=None, start=1)

An enumeration.
Expand source code
class Metrics(Enum): COUNT = 'count' PRIVACY_ID_COUNT = 'privacy_id_count' SUM = 'sum' MEAN = 'mean' VAR = 'variance'
Ancestors
 enum.Enum
Class variables
var COUNT
var MEAN
var PRIVACY_ID_COUNT
var SUM
var VAR
class NoiseKind (value, names=None, *, module=None, qualname=None, type=None, start=1)

An enumeration.
Expand source code
class NoiseKind(Enum): LAPLACE = 'laplace' GAUSSIAN = 'gaussian' def convert_to_mechanism_type(self): if self.value == NoiseKind.LAPLACE.value: return MechanismType.LAPLACE elif self.value == NoiseKind.GAUSSIAN.value: return MechanismType.GAUSSIAN
Ancestors
 enum.Enum
Class variables
var GAUSSIAN
var LAPLACE
Methods
def convert_to_mechanism_type(self)

Expand source code
def convert_to_mechanism_type(self): if self.value == NoiseKind.LAPLACE.value: return MechanismType.LAPLACE elif self.value == NoiseKind.GAUSSIAN.value: return MechanismType.GAUSSIAN
class NormKind (value, names=None, *, module=None, qualname=None, type=None, start=1)

An enumeration.
Expand source code
class NormKind(Enum): Linf = "linf" L0 = "l0" L1 = "l1" L2 = "l2"
Ancestors
 enum.Enum
Class variables
var L0
var L1
var L2
var Linf
class PrivacyIdCountParams (noise_kind: NoiseKind, max_partitions_contributed: int, partition_extractor: Callable, budget_weight: float = 1, public_partitions: Union[Iterable, ForwardRef('PCollection'), ForwardRef('RDD')] = None)

Specifies parameters for differentially-private privacy id count calculation.
Args
noise_kind
 Kind of noise to use for the DP calculations.
max_partitions_contributed
 Bounds the number of partitions in which one unit of privacy (e.g., a user) can participate.
budget_weight
 Relative weight of the privacy budget allocated for this operation.
partition_extractor
 A function for partition id extraction from a collection record.
public_partitions
 A collection of partition keys that will be present in the result.
Expand source code
@dataclass class PrivacyIdCountParams: """Specifies parameters for differentiallyprivate privacy id count calculation. Args: noise_kind: Kind of noise to use for the DP calculations. max_partitions_contributed: Bounds the number of partitions in which one unit of privacy (e.g., a user) can participate. budget_weight: Relative weight of the privacy budget allocated for this operation. partition_extractor: A function for partition id extraction from a collection record. public_partitions: A collection of partition keys that will be present in the result. """ noise_kind: NoiseKind max_partitions_contributed: int partition_extractor: Callable budget_weight: float = 1 public_partitions: Union[Iterable, 'PCollection', 'RDD'] = None
Class variables
var budget_weight : float
var max_partitions_contributed : int
var noise_kind : NoiseKind
var partition_extractor : Callable
var public_partitions : Union[Iterable, PCollection, RDD]
class SelectPrivatePartitionsParams (max_partitions_contributed: int)

Specifies parameters for differentially-private partition selection.
Args
max_partitions_contributed
 Maximum number of partitions per privacy ID. The algorithm will drop contributions over this limit. To keep more data, this should be a good estimate of the realistic upper bound. Significantly over- or underestimating this may increase the amount of dropped partitions. You can experiment with different values to select which one retains more partitions.
Expand source code
@dataclass class SelectPrivatePartitionsParams: """Specifies parameters for differentiallyprivate partition selection. Args: max_partitions_contributed: Maximum number of partitions per privacy ID. The algorithm will drop contributions over this limit. To keep more data, this should be a good estimate of the realistic upper bound. Significantly over or underestimating this may increase the amount of dropped partitions. You can experiment with different values to select which one retains more partitions. """ max_partitions_contributed: int def __str__(self): return "Private Partitions"
Class variables
var max_partitions_contributed : int
class SumParams (max_partitions_contributed: int, max_contributions_per_partition: int, min_value: float, max_value: float, partition_extractor: Callable, value_extractor: Callable, low: float = None, high: float = None, budget_weight: float = 1, noise_kind: NoiseKind = NoiseKind.LAPLACE, public_partitions: Union[Iterable, ForwardRef('PCollection'), ForwardRef('RDD')] = None)

Specifies parameters for differentially-private sum calculation.
Args
noise_kind
 Kind of noise to use for the DP calculations.
max_partitions_contributed
 Bounds the number of partitions in which one unit of privacy (e.g., a user) can participate.
max_contributions_per_partition
 Bounds the number of times one unit of privacy (e.g. a user) can contribute to a partition.
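min_value
 Lower bound on a value contributed by a unit of privacy in a partition. (The deprecated low argument is rejected with a ValueError; use min_value.)
max_value
 Upper bound on a value contributed by a unit of privacy in a partition. (The deprecated high argument is rejected with a ValueError; use max_value.)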
public_partitions
 A collection of partition keys that will be present in the result.
partition_extractor
 A function for partition id extraction from a collection record.
value_extractor
 A function for extraction of value for which the sum will be calculated.
Expand source code
@dataclass class SumParams: """Specifies parameters for differentiallyprivate sum calculation. Args: noise_kind: Kind of noise to use for the DP calculations. max_partitions_contributed: Bounds the number of partitions in which one unit of privacy (e.g., a user) can participate. max_contributions_per_partition: Bounds the number of times one unit of privacy (e.g. a user) can contribute to a partition. low: Lower bound on a value contributed by a unit of privacy in a partition. high: Upper bound on a value contributed by a unit of privacy in a partition. public_partitions: A collection of partition keys that will be present in the result. partition_extractor: A function for partition id extraction from a collection record. value_extractor: A function for extraction of value for which the sum will be calculated. """ max_partitions_contributed: int max_contributions_per_partition: int min_value: float max_value: float partition_extractor: Callable value_extractor: Callable low: float = None # deprecated high: float = None # deprecated budget_weight: float = 1 noise_kind: NoiseKind = NoiseKind.LAPLACE public_partitions: Union[Iterable, 'PCollection', 'RDD'] = None def __post_init__(self): if self.low is not None: raise ValueError("SumParams: please use min_value instead of low") if self.high is not None: raise ValueError("SumParams: please use max_value instead of high")
Class variables
var budget_weight : float
var high : float
var low : float
var max_contributions_per_partition : int
var max_partitions_contributed : int
var max_value : float
var min_value : float
var noise_kind : NoiseKind
var partition_extractor : Callable
var public_partitions : Union[Iterable, PCollection, RDD]
var value_extractor : Callable