# Copyright 2014-present MongoDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You
# may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
"""Internal class to monitor a topology of one or more servers."""
|
|
|
|
import os
|
|
import random
|
|
import threading
|
|
import warnings
|
|
import weakref
|
|
|
|
from bson.py3compat import itervalues, PY3
|
|
if PY3:
|
|
import queue as Queue
|
|
else:
|
|
import Queue
|
|
|
|
from pymongo import common
|
|
from pymongo import periodic_executor
|
|
from pymongo.pool import PoolOptions
|
|
from pymongo.topology_description import (updated_topology_description,
|
|
SERVER_TYPE,
|
|
TOPOLOGY_TYPE,
|
|
TopologyDescription)
|
|
from pymongo.errors import ServerSelectionTimeoutError, ConfigurationError
|
|
from pymongo.monotonic import time as _time
|
|
from pymongo.server import Server
|
|
from pymongo.server_selectors import (any_server_selector,
|
|
arbiter_server_selector,
|
|
secondary_server_selector,
|
|
readable_server_selector,
|
|
writable_server_selector,
|
|
Selection)
|
|
from pymongo.client_session import _ServerSessionPool
|
|
|
|
|
|


def process_events_queue(queue_ref):
    q = queue_ref()
    if not q:
        return False  # Cancel PeriodicExecutor.

    while True:
        try:
            event = q.get_nowait()
        except Queue.Empty:
            break
        else:
            fn, args = event
            fn(*args)

    return True  # Continue PeriodicExecutor.
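
# Each queued event is a (callable, args) tuple, so the loop above simply
# calls fn(*args) for everything currently in the queue. For example,
# Topology.__init__ below enqueues
# (self._listeners.publish_topology_opened, (self._topology_id,)).
# The queue is handed to this function as a weak reference, so returning
# False here cancels the PeriodicExecutor once the Topology that owns the
# queue has been garbage collected.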


class Topology(object):
    """Monitor a topology of one or more servers."""

    def __init__(self, topology_settings):
        self._topology_id = topology_settings._topology_id
        self._listeners = topology_settings._pool_options.event_listeners
        pub = self._listeners is not None
        self._publish_server = pub and self._listeners.enabled_for_server
        self._publish_tp = pub and self._listeners.enabled_for_topology

        # Create events queue if there are publishers.
        self._events = None
        self._events_thread = None

        if self._publish_server or self._publish_tp:
            self._events = Queue.Queue(maxsize=100)

            if self._publish_tp:
                self._events.put((self._listeners.publish_topology_opened,
                                  (self._topology_id,)))
        self._settings = topology_settings
        topology_description = TopologyDescription(
            topology_settings.get_topology_type(),
            topology_settings.get_server_descriptions(),
            topology_settings.replica_set_name,
            None,
            None,
            topology_settings)

        self._description = topology_description
        if self._publish_tp:
            initial_td = TopologyDescription(TOPOLOGY_TYPE.Unknown, {}, None,
                                             None, None, self._settings)
            self._events.put((
                self._listeners.publish_topology_description_changed,
                (initial_td, self._description, self._topology_id)))

        for seed in topology_settings.seeds:
            if self._publish_server:
                self._events.put((self._listeners.publish_server_opened,
                                  (seed, self._topology_id)))

        # Store the seed list to help diagnose errors in _error_message().
        self._seed_addresses = list(topology_description.server_descriptions())
        self._opened = False
        self._lock = threading.Lock()
        self._condition = self._settings.condition_class(self._lock)
        self._servers = {}
        self._pid = None
        self._max_cluster_time = None
        self._session_pool = _ServerSessionPool()

        if self._publish_server or self._publish_tp:
            def target():
                return process_events_queue(weak)

            executor = periodic_executor.PeriodicExecutor(
                interval=common.EVENTS_QUEUE_FREQUENCY,
                min_interval=0.5,
                target=target,
                name="pymongo_events_thread")

            # We strongly reference the executor and it weakly references
            # the queue via this closure. When the topology is freed, stop
            # the executor soon.
            weak = weakref.ref(self._events)
            self.__events_executor = executor
            executor.open()

    def open(self):
        """Start monitoring, or restart after a fork.

        No effect if called multiple times.

        .. warning:: Topology is shared among multiple threads and is protected
          by mutual exclusion. Using Topology from a process other than the one
          that initialized it will emit a warning and may result in deadlock. To
          prevent this from happening, MongoClient must be created after any
          forking.

        """
        if self._pid is None:
            self._pid = os.getpid()
        else:
            if os.getpid() != self._pid:
                warnings.warn(
                    "MongoClient opened before fork. Create MongoClient only "
                    "after forking. See PyMongo's documentation for details: "
                    "http://api.mongodb.org/python/current/faq.html#"
                    "is-pymongo-fork-safe")

        with self._lock:
            self._ensure_opened()
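
    # Illustrative only (hypothetical user code, not part of this module):
    # the pattern the warning above asks for is to construct the client in
    # the child process, after forking, e.g.
    #
    #     pid = os.fork()
    #     if pid == 0:
    #         client = MongoClient()   # safe: created after the fork
    #         client.admin.command('ping')
    #
    # rather than creating a MongoClient in the parent and reusing it in the
    # child.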

    def select_servers(self,
                       selector,
                       server_selection_timeout=None,
                       address=None):
        """Return a list of Servers matching selector, or time out.

        :Parameters:
          - `selector`: function that takes a list of Servers and returns
            a subset of them.
          - `server_selection_timeout` (optional): maximum seconds to wait.
            If not provided, the default value common.SERVER_SELECTION_TIMEOUT
            is used.
          - `address`: optional server address to select.

        Calls self.open() if needed.

        Raises :exc:`ServerSelectionTimeoutError` after
        `server_selection_timeout` if no matching servers are found.
        """
        if server_selection_timeout is None:
            server_timeout = self._settings.server_selection_timeout
        else:
            server_timeout = server_selection_timeout

        with self._lock:
            server_descriptions = self._select_servers_loop(
                selector, server_timeout, address)

        return [self.get_server_by_address(sd.address)
                for sd in server_descriptions]
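
    # Illustrative only (hypothetical caller, not part of this module): a
    # typical use selects every writable server, waiting up to the default
    # server selection timeout:
    #
    #     servers = topology.select_servers(writable_server_selector)
    #
    # where `topology` is an already-constructed Topology instance.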

    def _select_servers_loop(self, selector, timeout, address):
        """select_servers() guts. Hold the lock when calling this."""
        now = _time()
        end_time = now + timeout
        server_descriptions = self._description.apply_selector(
            selector, address, custom_selector=self._settings.server_selector)

        while not server_descriptions:
            # No suitable servers.
            if timeout == 0 or now > end_time:
                raise ServerSelectionTimeoutError(
                    self._error_message(selector))

            self._ensure_opened()
            self._request_check_all()

            # Release the lock and wait for the topology description to
            # change, or for a timeout. We won't miss any changes that
            # came after our most recent apply_selector call, since we've
            # held the lock until now.
            self._condition.wait(common.MIN_HEARTBEAT_INTERVAL)
            self._description.check_compatible()
            now = _time()
            server_descriptions = self._description.apply_selector(
                selector, address,
                custom_selector=self._settings.server_selector)

        self._description.check_compatible()
        return server_descriptions

    def select_server(self,
                      selector,
                      server_selection_timeout=None,
                      address=None):
        """Like select_servers, but choose a random server if several match."""
        return random.choice(self.select_servers(selector,
                                                 server_selection_timeout,
                                                 address))

    def select_server_by_address(self, address,
                                 server_selection_timeout=None):
        """Return a Server for "address", reconnecting if necessary.

        If the server's type is not known, request an immediate check of all
        servers. Time out after "server_selection_timeout" if the server
        cannot be reached.

        :Parameters:
          - `address`: A (host, port) pair.
          - `server_selection_timeout` (optional): maximum seconds to wait.
            If not provided, the default value
            common.SERVER_SELECTION_TIMEOUT is used.

        Calls self.open() if needed.

        Raises :exc:`ServerSelectionTimeoutError` after
        `server_selection_timeout` if no matching servers are found.
        """
        return self.select_server(any_server_selector,
                                  server_selection_timeout,
                                  address)

    def _process_change(self, server_description):
        """Process a new ServerDescription on an opened topology.

        Hold the lock when calling this.
        """
        td_old = self._description
        if self._publish_server:
            old_server_description = td_old._server_descriptions[
                server_description.address]
            self._events.put((
                self._listeners.publish_server_description_changed,
                (old_server_description, server_description,
                 server_description.address, self._topology_id)))

        self._description = updated_topology_description(
            self._description, server_description)

        self._update_servers()
        self._receive_cluster_time_no_lock(server_description.cluster_time)

        if self._publish_tp:
            self._events.put((
                self._listeners.publish_topology_description_changed,
                (td_old, self._description, self._topology_id)))

        # Wake waiters in select_servers().
        self._condition.notify_all()

    def on_change(self, server_description):
        """Process a new ServerDescription after an ismaster call completes."""
        # We do no I/O holding the lock.
        with self._lock:
            # Monitors may continue working on ismaster calls for some time
            # after a call to Topology.close, so this method may be called at
            # any time. Ensure the topology is open before processing the
            # change.
            # Any monitored server was definitely in the topology description
            # once. Check if it's still in the description or if some state-
            # change removed it. E.g., we got a host list from the primary
            # that didn't include this server.
            if (self._opened and
                    self._description.has_server(server_description.address)):
                self._process_change(server_description)

    def get_server_by_address(self, address):
        """Get a Server or None.

        Returns the current version of the server immediately, even if it's
        Unknown or absent from the topology. Only use this in unittests.
        In driver code, use select_server_by_address, since then you're
        assured a recent view of the server's type and wire protocol version.
        """
        return self._servers.get(address)

    def has_server(self, address):
        return address in self._servers

    def get_primary(self):
        """Return primary's address or None."""
        # Implemented here in Topology instead of MongoClient, so it can lock.
        with self._lock:
            topology_type = self._description.topology_type
            if topology_type != TOPOLOGY_TYPE.ReplicaSetWithPrimary:
                return None

            return writable_server_selector(self._new_selection())[0].address

    def _get_replica_set_members(self, selector):
        """Return set of replica set member addresses."""
        # Implemented here in Topology instead of MongoClient, so it can lock.
        with self._lock:
            topology_type = self._description.topology_type
            if topology_type not in (TOPOLOGY_TYPE.ReplicaSetWithPrimary,
                                     TOPOLOGY_TYPE.ReplicaSetNoPrimary):
                return set()

            return set([sd.address for sd in selector(self._new_selection())])

    def get_secondaries(self):
        """Return set of secondary addresses."""
        return self._get_replica_set_members(secondary_server_selector)

    def get_arbiters(self):
        """Return set of arbiter addresses."""
        return self._get_replica_set_members(arbiter_server_selector)

    def max_cluster_time(self):
        """Return a document, the highest seen $clusterTime."""
        return self._max_cluster_time

    def _receive_cluster_time_no_lock(self, cluster_time):
        # Driver Sessions Spec: "Whenever a driver receives a cluster time from
        # a server it MUST compare it to the current highest seen cluster time
        # for the deployment. If the new cluster time is higher than the
        # highest seen cluster time it MUST become the new highest seen cluster
        # time. Two cluster times are compared using only the BsonTimestamp
        # value of the clusterTime embedded field."
        if cluster_time:
            # ">" uses bson.timestamp.Timestamp's comparison operator.
            if (not self._max_cluster_time
                    or cluster_time['clusterTime'] >
                    self._max_cluster_time['clusterTime']):
                self._max_cluster_time = cluster_time
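
    # Illustrative only: a $clusterTime document received from the server is
    # roughly of the form
    #
    #     {'clusterTime': Timestamp(1565362800, 1), 'signature': {...}}
    #
    # so the comparison above reduces to comparing two
    # bson.timestamp.Timestamp values.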

    def receive_cluster_time(self, cluster_time):
        with self._lock:
            self._receive_cluster_time_no_lock(cluster_time)

    def request_check_all(self, wait_time=5):
        """Wake all monitors, wait for at least one to check its server."""
        with self._lock:
            self._request_check_all()
            self._condition.wait(wait_time)

    def reset_pool(self, address):
        with self._lock:
            server = self._servers.get(address)
            if server:
                server.pool.reset()

    def reset_server(self, address):
        """Clear our pool for a server and mark it Unknown.

        Do *not* request an immediate check.
        """
        with self._lock:
            self._reset_server(address)

    def reset_server_and_request_check(self, address):
        """Clear our pool for a server, mark it Unknown, and check it soon."""
        with self._lock:
            self._reset_server(address)
            self._request_check(address)

    def update_pool(self):
        # Remove any stale sockets and add new sockets if pool is too small.
        with self._lock:
            for server in self._servers.values():
                server._pool.remove_stale_sockets()

    def close(self):
        """Clear pools and terminate monitors. Topology reopens on demand."""
        with self._lock:
            for server in self._servers.values():
                server.close()

            # Mark all servers Unknown.
            self._description = self._description.reset()
            self._update_servers()
            self._opened = False

        # Publish only after releasing the lock.
        if self._publish_tp:
            self._events.put((self._listeners.publish_topology_closed,
                              (self._topology_id,)))
        if self._publish_server or self._publish_tp:
            self.__events_executor.close()

    @property
    def description(self):
        return self._description

    def pop_all_sessions(self):
        """Pop all session ids from the pool."""
        with self._lock:
            return self._session_pool.pop_all()

    def get_server_session(self):
        """Start or resume a server session, or raise ConfigurationError."""
        with self._lock:
            session_timeout = self._description.logical_session_timeout_minutes
            if session_timeout is None:
                # Maybe we need an initial scan? Can raise ServerSelectionError.
                if self._description.topology_type == TOPOLOGY_TYPE.Single:
                    if not self._description.has_known_servers:
                        self._select_servers_loop(
                            any_server_selector,
                            self._settings.server_selection_timeout,
                            None)
                elif not self._description.readable_servers:
                    self._select_servers_loop(
                        readable_server_selector,
                        self._settings.server_selection_timeout,
                        None)

                session_timeout = \
                    self._description.logical_session_timeout_minutes
                if session_timeout is None:
                    raise ConfigurationError(
                        "Sessions are not supported by this MongoDB deployment")

            return self._session_pool.get_server_session(session_timeout)

    def return_server_session(self, server_session, lock):
        if lock:
            with self._lock:
                session_timeout = \
                    self._description.logical_session_timeout_minutes
                if session_timeout is not None:
                    self._session_pool.return_server_session(server_session,
                                                             session_timeout)
        else:
            # Called from a __del__ method, can't use a lock.
            self._session_pool.return_server_session_no_lock(server_session)

    def is_mongos_non_blocking(self):
        """Return if we are connected to a Mongos without blocking.

        If the state is unknown, return False.
        """
        with self._lock:
            if not self._opened:
                return False
            if self._description.topology_type == TOPOLOGY_TYPE.Sharded:
                return True
            server_descriptions = self._description.apply_selector(
                writable_server_selector, None)
            if not server_descriptions:
                return False
            return server_descriptions[0].server_type == SERVER_TYPE.Mongos

    def _new_selection(self):
        """A Selection object, initially including all known servers.

        Hold the lock when calling this.
        """
        return Selection.from_topology_description(self._description)

    def _ensure_opened(self):
        """Start monitors, or restart after a fork.

        Hold the lock when calling this.
        """
        if not self._opened:
            self._opened = True
            self._update_servers()

            # Start or restart the events publishing thread.
            if self._publish_tp or self._publish_server:
                self.__events_executor.open()

        # Ensure that the monitors are open.
        for server in itervalues(self._servers):
            server.open()

    def _reset_server(self, address):
        """Clear our pool for a server and mark it Unknown.

        Hold the lock when calling this. Does *not* request an immediate check.
        """
        server = self._servers.get(address)

        # "server" is None if another thread removed it from the topology.
        if server:
            server.reset()

            # Mark this server Unknown.
            self._description = self._description.reset_server(address)
            self._update_servers()

    def _request_check(self, address):
        """Wake one monitor. Hold the lock when calling this."""
        server = self._servers.get(address)

        # "server" is None if another thread removed it from the topology.
        if server:
            server.request_check()

    def _request_check_all(self):
        """Wake all monitors. Hold the lock when calling this."""
        for server in self._servers.values():
            server.request_check()

    def _update_servers(self):
        """Sync our Servers from TopologyDescription.server_descriptions.

        Hold the lock while calling this.
        """
        for address, sd in self._description.server_descriptions().items():
            if address not in self._servers:
                monitor = self._settings.monitor_class(
                    server_description=sd,
                    topology=self,
                    pool=self._create_pool_for_monitor(address),
                    topology_settings=self._settings)

                weak = None
                if self._publish_server:
                    weak = weakref.ref(self._events)
                server = Server(
                    server_description=sd,
                    pool=self._create_pool_for_server(address),
                    monitor=monitor,
                    topology_id=self._topology_id,
                    listeners=self._listeners,
                    events=weak)

                self._servers[address] = server
                server.open()
            else:
                self._servers[address].description = sd

        for address, server in list(self._servers.items()):
            if not self._description.has_server(address):
                server.close()
                self._servers.pop(address)

    def _create_pool_for_server(self, address):
        return self._settings.pool_class(address, self._settings.pool_options)

    def _create_pool_for_monitor(self, address):
        options = self._settings.pool_options

        # According to the Server Discovery And Monitoring Spec, monitors use
        # connect_timeout for both connect_timeout and socket_timeout. The
        # pool only has one socket so maxPoolSize and so on aren't needed.
        monitor_pool_options = PoolOptions(
            connect_timeout=options.connect_timeout,
            socket_timeout=options.connect_timeout,
            ssl_context=options.ssl_context,
            ssl_match_hostname=options.ssl_match_hostname,
            event_listeners=options.event_listeners,
            appname=options.appname,
            driver=options.driver)

        return self._settings.pool_class(address, monitor_pool_options,
                                         handshake=False)
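
    # Illustrative only: with connectTimeoutMS=10000 the monitor pool built
    # above uses 10 seconds for both its connect and socket timeouts, while
    # the application pools created by _create_pool_for_server keep their own
    # socketTimeoutMS setting.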

    def _error_message(self, selector):
        """Format an error message if server selection fails.

        Hold the lock when calling this.
        """
        is_replica_set = self._description.topology_type in (
            TOPOLOGY_TYPE.ReplicaSetWithPrimary,
            TOPOLOGY_TYPE.ReplicaSetNoPrimary)

        if is_replica_set:
            server_plural = 'replica set members'
        elif self._description.topology_type == TOPOLOGY_TYPE.Sharded:
            server_plural = 'mongoses'
        else:
            server_plural = 'servers'

        if self._description.known_servers:
            # We've connected, but no servers match the selector.
            if selector is writable_server_selector:
                if is_replica_set:
                    return 'No primary available for writes'
                else:
                    return 'No %s available for writes' % server_plural
            else:
                return 'No %s match selector "%s"' % (server_plural, selector)
        else:
            addresses = list(self._description.server_descriptions())
            servers = list(self._description.server_descriptions().values())
            if not servers:
                if is_replica_set:
                    # We removed all servers because of the wrong setName?
                    return 'No %s available for replica set name "%s"' % (
                        server_plural, self._settings.replica_set_name)
                else:
                    return 'No %s available' % server_plural

            # 1 or more servers, all Unknown. Are they unknown for one reason?
            error = servers[0].error
            same = all(server.error == error for server in servers[1:])
            if same:
                if error is None:
                    # We're still discovering.
                    return 'No %s found yet' % server_plural

                if (is_replica_set and not
                        set(addresses).intersection(self._seed_addresses)):
                    # We replaced our seeds with new hosts but can't reach any.
                    return (
                        'Could not reach any servers in %s. Replica set is'
                        ' configured with internal hostnames or IPs?' %
                        addresses)

                return str(error)
            else:
                return ','.join(str(server.error) for server in servers
                                if server.error)