#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import gc
import os
import sys
from tempfile import NamedTemporaryFile
import threading
import pickle
from typing import (
overload,
Any,
BinaryIO,
Callable,
Dict,
Generic,
IO,
Iterator,
Optional,
Tuple,
TypeVar,
TYPE_CHECKING,
Union,
)
from pyspark.java_gateway import local_connect_and_auth
from pyspark.serializers import ChunkedStream, pickle_protocol
from pyspark.util import print_exec
from pyspark.errors import PySparkRuntimeError
if TYPE_CHECKING:
from pyspark import SparkContext
# Names exported by ``from pyspark.broadcast import *``.
__all__ = ["Broadcast"]
# Type variable for the value wrapped by ``Broadcast`` (the class is ``Generic[T]``).
T = TypeVar("T")
# Holds broadcasted data received from Java, keyed by its id.
# ``_from_id`` reads from this mapping; nothing visible in this file inserts into it.
# NOTE(review): presumably filled by the worker-side code -- confirm against the caller.
_broadcastRegistry: Dict[int, "Broadcast[Any]"] = {}
def _from_id(bid: int) -> "Broadcast[Any]":
    """
    Return the broadcast variable registered under ``bid``.

    This is the reconstructor that :meth:`Broadcast.__reduce__` hands to pickle,
    so unpickling a broadcast resolves to the registry entry with the same id.

    Parameters
    ----------
    bid : int
        Id of the broadcast variable to look up.

    Returns
    -------
    :class:`Broadcast`
        The entry stored in ``_broadcastRegistry`` for ``bid``.

    Raises
    ------
    PySparkRuntimeError
        With error class ``BROADCAST_VARIABLE_NOT_LOADED`` when ``bid`` is not
        in the registry.
    """
    # The registry is re-imported from ``pyspark.broadcast`` instead of read from
    # this function's globals.
    # NOTE(review): presumably so the lookup always hits the canonical module's
    # registry even when this function was shipped by value -- confirm.
    from pyspark.broadcast import _broadcastRegistry

    if bid in _broadcastRegistry:
        return _broadcastRegistry[bid]

    raise PySparkRuntimeError(
        error_class="BROADCAST_VARIABLE_NOT_LOADED",
        message_parameters={"variable": str(bid)},
    )
class Broadcast(Generic[T]):
    """
    A broadcast variable created with :meth:`SparkContext.broadcast`.
    Access its value through :attr:`value`.

    Examples
    --------
    >>> b = spark.sparkContext.broadcast([1, 2, 3, 4, 5])
    >>> b.value
    [1, 2, 3, 4, 5]
    >>> spark.sparkContext.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
    [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
    >>> b.unpersist()

    >>> large_broadcast = spark.sparkContext.broadcast(range(10000))
    """

    @overload  # On driver
    def __init__(
        self: "Broadcast[T]",
        sc: "SparkContext",
        value: T,
        pickle_registry: "BroadcastPickleRegistry",
    ):
        ...

    @overload  # On worker without decryption server
    def __init__(self: "Broadcast[Any]", *, path: str):
        ...

    @overload  # On worker with decryption server
    def __init__(self: "Broadcast[Any]", *, sock_file: BinaryIO):
        ...

    def __init__(  # type: ignore[misc]
        self,
        sc: Optional["SparkContext"] = None,
        value: Optional[T] = None,
        pickle_registry: Optional["BroadcastPickleRegistry"] = None,
        path: Optional[str] = None,
        sock_file: Optional[BinaryIO] = None,
    ):
        """
        Should not be called directly by users -- use :meth:`SparkContext.broadcast`
        instead.

        Parameters
        ----------
        sc : :class:`SparkContext`, optional
            When given, the instance is built on the driver: ``value`` is pickled
            into a temporary file under ``sc._temp_dir`` (through the JVM when
            encryption is enabled) and registered with ``sc._jsc.broadcast``.
        value : T, optional
            The value to broadcast. Only used together with ``sc``.
        pickle_registry : :class:`BroadcastPickleRegistry`, optional
            Registry this broadcast adds itself to when it is pickled.
            Only stored when ``sc`` is given.
        path : str, optional
            Executor side: file holding the pickled value. It is unpickled lazily
            on the first access to :attr:`value`.
        sock_file : :class:`BinaryIO`, optional
            Executor side: stream from which the value is read immediately.
        """
        if sc is not None:
            # we're on the driver.  We want the pickled data to end up in a file (maybe encrypted)
            f = NamedTemporaryFile(delete=False, dir=sc._temp_dir)
            self._path = f.name
            self._sc: Optional["SparkContext"] = sc
            assert sc._jvm is not None
            self._python_broadcast = sc._jvm.PythonRDD.setupBroadcast(self._path)
            broadcast_out: Union[ChunkedStream, IO[bytes]]
            if sc._encryption_enabled:
                # The temp file only reserves the path here: Python never writes to it in
                # this branch, so release our handle instead of leaking it. The file itself
                # stays on disk because it was created with delete=False.
                f.close()
                # with encryption, we ask the jvm to do the encryption for us, we send it data
                # over a socket
                port, auth_secret = self._python_broadcast.setupEncryptionServer()
                (encryption_sock_file, _) = local_connect_and_auth(port, auth_secret)
                broadcast_out = ChunkedStream(encryption_sock_file, 8192)
            else:
                # no encryption, we can just write pickled data directly to the file from python
                broadcast_out = f
            # dump() closes broadcast_out once the value has been written.
            self.dump(value, broadcast_out)  # type: ignore[arg-type]
            if sc._encryption_enabled:
                self._python_broadcast.waitTillDataReceived()
            self._jbroadcast = sc._jsc.broadcast(self._python_broadcast)
            self._pickle_registry = pickle_registry
        else:
            # we're on an executor
            self._jbroadcast = None
            self._sc = None
            self._python_broadcast = None
            if sock_file is not None:
                # the jvm is doing decryption for us.  Read the value
                # immediately from the sock_file
                self._value = self.load(sock_file)
            else:
                # the jvm just dumps the pickled data in path -- we'll unpickle lazily when
                # the value is requested
                assert path is not None
                self._path = path

    def dump(self, value: T, f: BinaryIO) -> None:
        """
        Write a pickled representation of value to the open file or socket.
        The value is pickled with the ``pickle_protocol`` imported from
        :mod:`pyspark.serializers`. ``f`` is always closed before this method
        returns or raises.

        Parameters
        ----------
        value : T
            Value to write.
        f : :class:`BinaryIO`
            File or socket where the pickled value will be stored.

        Raises
        ------
        pickle.PicklingError
            If the value cannot be serialized. Non-pickle exceptions are wrapped
            (with the original attached as ``__cause__``) after the traceback is
            printed to ``sys.stderr``.

        Examples
        --------
        >>> import os
        >>> import tempfile

        >>> b = spark.sparkContext.broadcast([1, 2, 3, 4, 5])

        Write a pickled representation of `b` to the open temp file.

        >>> with tempfile.TemporaryDirectory() as d:
        ...     path = os.path.join(d, "test.txt")
        ...     with open(path, "wb") as f:
        ...         b.dump(b.value, f)
        """
        try:
            pickle.dump(value, f, pickle_protocol)
        except pickle.PickleError:
            raise
        except Exception as e:
            msg = "Could not serialize broadcast: %s: %s" % (e.__class__.__name__, str(e))
            print_exec(sys.stderr)
            raise pickle.PicklingError(msg) from e
        finally:
            # Close on the failure path too, so a value that cannot be pickled does not
            # leak the file or socket.
            f.close()

    def load_from_path(self, path: str) -> T:
        """
        Read the pickled representation of an object from the file at ``path`` and
        return the reconstituted object hierarchy specified therein.

        Parameters
        ----------
        path : str
            Path of the file the pickled value is read from.

        Returns
        -------
        T
            The object hierarchy specified therein reconstituted
            from the pickled representation of an object.

        Examples
        --------
        >>> import os
        >>> import tempfile

        >>> b = spark.sparkContext.broadcast([1, 2, 3, 4, 5])
        >>> c = spark.sparkContext.broadcast(1)

        Read the pickled representation of value from temp file.

        >>> with tempfile.TemporaryDirectory() as d:
        ...     path = os.path.join(d, "test.txt")
        ...     with open(path, "wb") as f:
        ...         b.dump(b.value, f)
        ...     c.load_from_path(path)
        [1, 2, 3, 4, 5]
        """
        # Open with a 1 MiB buffer (1 << 20 bytes).
        with open(path, "rb", 1 << 20) as f:
            return self.load(f)

    def load(self, file: BinaryIO) -> T:
        """
        Read a pickled representation of value from the open file or socket.
        The stream is left open; closing it is the caller's responsibility.

        Parameters
        ----------
        file : :class:`BinaryIO`
            File or socket where the pickled value will be read.

        Returns
        -------
        T
            The object hierarchy specified therein reconstituted
            from the pickled representation of an object.

        Examples
        --------
        >>> import os
        >>> import tempfile

        >>> b = spark.sparkContext.broadcast([1, 2, 3, 4, 5])
        >>> c = spark.sparkContext.broadcast(1)

        Read the pickled representation of value from the open temp file.

        >>> with tempfile.TemporaryDirectory() as d:
        ...     path = os.path.join(d, "test.txt")
        ...     with open(path, "wb") as f:
        ...         b.dump(b.value, f)
        ...     with open(path, "rb") as f:
        ...         c.load(f)
        [1, 2, 3, 4, 5]
        """
        # The garbage collector is switched off while unpickling and unconditionally
        # switched back on afterwards, even if unpickling fails.
        gc.disable()
        try:
            return pickle.load(file)
        finally:
            gc.enable()

    @property
    def value(self) -> T:
        """
        Return the broadcasted value.

        The value is loaded on first access and kept on the instance, so later
        accesses return the same object without reading the data again.
        """
        if not hasattr(self, "_value") and self._path is not None:
            # we only need to decrypt it here when encryption is enabled and
            # if its on the driver, since executor decryption is handled already
            if self._sc is not None and self._sc._encryption_enabled:
                port, auth_secret = self._python_broadcast.setupDecryptionServer()
                (decrypted_sock_file, _) = local_connect_and_auth(port, auth_secret)
                try:
                    self._python_broadcast.waitTillBroadcastDataSent()
                    # Cache the result like the unencrypted branch does, so that each
                    # access does not start another decryption round trip.
                    self._value = self.load(decrypted_sock_file)
                finally:
                    # This connection was opened here, so it is closed here.
                    decrypted_sock_file.close()
            else:
                self._value = self.load_from_path(self._path)
        return self._value

    def unpersist(self, blocking: bool = False) -> None:
        """
        Delete cached copies of this broadcast on the executors. If the
        broadcast is used after this is called, it will need to be
        re-sent to each executor.

        Parameters
        ----------
        blocking : bool, optional, default False
            Whether to block until unpersisting has completed.

        Raises
        ------
        PySparkRuntimeError
            With error class ``INVALID_BROADCAST_OPERATION`` when this instance
            has no JVM-side broadcast (it was not created from a SparkContext).

        Examples
        --------
        >>> b = spark.sparkContext.broadcast([1, 2, 3, 4, 5])

        Delete cached copies of this broadcast on the executors

        >>> b.unpersist()
        """
        if self._jbroadcast is None:
            raise PySparkRuntimeError(
                error_class="INVALID_BROADCAST_OPERATION",
                message_parameters={"operation": "unpersisted"},
            )
        self._jbroadcast.unpersist(blocking)

    def destroy(self, blocking: bool = False) -> None:
        """
        Destroy all data and metadata related to this broadcast variable.
        Use this with caution; once a broadcast variable has been destroyed,
        it cannot be used again.

        .. versionchanged:: 3.0.0
           Added optional argument `blocking` to specify whether to block until all
           blocks are deleted.

        Parameters
        ----------
        blocking : bool, optional, default False
            Whether to block until destroying has completed.

        Raises
        ------
        PySparkRuntimeError
            With error class ``INVALID_BROADCAST_OPERATION`` when this instance
            has no JVM-side broadcast (it was not created from a SparkContext).

        Examples
        --------
        >>> b = spark.sparkContext.broadcast([1, 2, 3, 4, 5])

        Destroy all data and metadata related to this broadcast variable

        >>> b.destroy()
        """
        if self._jbroadcast is None:
            raise PySparkRuntimeError(
                error_class="INVALID_BROADCAST_OPERATION",
                message_parameters={"operation": "destroyed"},
            )
        self._jbroadcast.destroy(blocking)
        # Also remove the local file that backs this broadcast.
        os.unlink(self._path)

    def __reduce__(self) -> Tuple[Callable[[int], "Broadcast[T]"], Tuple[int]]:
        """
        Pickle this broadcast as a reference to its id rather than by value.

        The instance is recorded in its pickle registry and is rebuilt on the
        receiving side by calling ``_from_id`` with the JVM broadcast id.

        Raises
        ------
        PySparkRuntimeError
            With error class ``INVALID_BROADCAST_OPERATION`` when this instance
            has no JVM-side broadcast (it was not created from a SparkContext).
        """
        if self._jbroadcast is None:
            raise PySparkRuntimeError(
                error_class="INVALID_BROADCAST_OPERATION",
                message_parameters={"operation": "serialized"},
            )
        assert self._pickle_registry is not None
        self._pickle_registry.add(self)
        return _from_id, (self._jbroadcast.id(),)
class BroadcastPickleRegistry(threading.local):
    """
    Thread-local registry for broadcast variables that have been pickled.

    The class derives from :class:`threading.local`, so the backing set lives in
    the per-thread instance dictionary.
    """

    def __init__(self) -> None:
        # Create the backing set only when the instance dict does not have one yet;
        # an existing set is left untouched.
        state = self.__dict__
        if "_registry" not in state:
            state["_registry"] = set()

    def __iter__(self) -> Iterator[Broadcast[Any]]:
        """Yield every broadcast currently held in the registry."""
        yield from self._registry

    def add(self, bcast: Broadcast[Any]) -> None:
        """Record ``bcast`` in the registry."""
        registered = self._registry
        registered.add(bcast)

    def clear(self) -> None:
        """Remove every broadcast from the registry."""
        registered = self._registry
        registered.clear()
def _test() -> None:
    """
    Run the doctests of ``pyspark.broadcast`` against a local SparkSession.

    The session is exposed to the examples under the name ``spark`` and is
    stopped once the run finishes. The process exits with status ``-1`` when
    any doctest fails.
    """
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.broadcast

    module = pyspark.broadcast
    # Shallow copy, so the extra ``spark`` name does not leak into the module itself.
    namespace = dict(module.__dict__)
    builder = SparkSession.builder.master("local[4]").appName("broadcast tests")
    session = builder.getOrCreate()
    namespace["spark"] = session

    outcome = doctest.testmod(module, globs=namespace)
    session.stop()
    if outcome.failed:
        sys.exit(-1)


# Run the doctests when this file is executed as a script.
if __name__ == "__main__":
    _test()