설명 없음
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

hashtable-common.h 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. // Copyright (c) 2010, Google Inc.
  2. // All rights reserved.
  3. //
  4. // Redistribution and use in source and binary forms, with or without
  5. // modification, are permitted provided that the following conditions are
  6. // met:
  7. //
  8. // * Redistributions of source code must retain the above copyright
  9. // notice, this list of conditions and the following disclaimer.
  10. // * Redistributions in binary form must reproduce the above
  11. // copyright notice, this list of conditions and the following disclaimer
  12. // in the documentation and/or other materials provided with the
  13. // distribution.
  14. // * Neither the name of Google Inc. nor the names of its
  15. // contributors may be used to endorse or promote products derived from
  16. // this software without specific prior written permission.
  17. //
  18. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. // ---
  30. //
  31. // Provides classes shared by both sparse and dense hashtable.
  32. //
  33. // sh_hashtable_settings has parameters for growing and shrinking
  34. // a hashtable. It also packages zero-size functor (ie. hasher).
  35. //
  36. // Other functions and classes provide common code for serializing
  37. // and deserializing hashtables to a stream (such as a FILE*).
  38. #ifndef UTIL_GTL_HASHTABLE_COMMON_H_
  39. #define UTIL_GTL_HASHTABLE_COMMON_H_
#include "sparseconfig.h"
#include <assert.h>
#include <stddef.h>    // for size_t
#include <stdio.h>
#include <stdlib.h>    // for abort
#include <iosfwd>
#include <stdexcept>   // For length_error
  46. _START_GOOGLE_NAMESPACE_
  47. template <bool> struct SparsehashCompileAssert { };
  48. #define SPARSEHASH_COMPILE_ASSERT(expr, msg) \
  49. __attribute__((unused)) typedef SparsehashCompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
  50. namespace sparsehash_internal {
  51. // Adaptor methods for reading/writing data from an INPUT or OUTPUT
  52. // variable passed to serialize() or unserialize(). For now we
  53. // have implemented INPUT/OUTPUT for FILE*, istream*/ostream* (note
  54. // they are pointers, unlike typical use), or else a pointer to
  55. // something that supports a Read()/Write() method.
  56. //
  57. // For technical reasons, we implement read_data/write_data in two
  58. // stages. The actual work is done in *_data_internal, which takes
  59. // the stream argument twice: once as a template type, and once with
  60. // normal type information. (We only use the second version.) We do
  61. // this because of how C++ picks what function overload to use. If we
  62. // implemented this the naive way:
  63. // bool read_data(istream* is, const void* data, size_t length);
  64. // template<typename T> read_data(T* fp, const void* data, size_t length);
  65. // C++ would prefer the second version for every stream type except
  66. // istream. However, we want C++ to prefer the first version for
  67. // streams that are *subclasses* of istream, such as istringstream.
  68. // This is not possible given the way template types are resolved. So
  69. // we split the stream argument in two, one of which is templated and
  70. // one of which is not. The specialized functions (like the istream
  71. // version above) ignore the template arg and use the second, 'type'
  72. // arg, getting subclass matching as normal. The 'catch-all'
  73. // functions (the second version above) use the template arg to deduce
  74. // the type, and use a second, void* arg to achieve the desired
  75. // 'catch-all' semantics.
  76. // ----- low-level I/O for FILE* ----
  77. template<typename Ignored>
  78. inline bool read_data_internal(Ignored*, FILE* fp,
  79. void* data, size_t length) {
  80. return fread(data, length, 1, fp) == 1;
  81. }
  82. template<typename Ignored>
  83. inline bool write_data_internal(Ignored*, FILE* fp,
  84. const void* data, size_t length) {
  85. return fwrite(data, length, 1, fp) == 1;
  86. }
  87. // ----- low-level I/O for iostream ----
  88. // We want the caller to be responsible for #including <iostream>, not
  89. // us, because iostream is a big header! According to the standard,
  90. // it's only legal to delay the instantiation the way we want to if
  91. // the istream/ostream is a template type. So we jump through hoops.
  92. template<typename ISTREAM>
  93. inline bool read_data_internal_for_istream(ISTREAM* fp,
  94. void* data, size_t length) {
  95. return fp->read(reinterpret_cast<char*>(data), length).good();
  96. }
  97. template<typename Ignored>
  98. inline bool read_data_internal(Ignored*, std::istream* fp,
  99. void* data, size_t length) {
  100. return read_data_internal_for_istream(fp, data, length);
  101. }
  102. template<typename OSTREAM>
  103. inline bool write_data_internal_for_ostream(OSTREAM* fp,
  104. const void* data, size_t length) {
  105. return fp->write(reinterpret_cast<const char*>(data), length).good();
  106. }
  107. template<typename Ignored>
  108. inline bool write_data_internal(Ignored*, std::ostream* fp,
  109. const void* data, size_t length) {
  110. return write_data_internal_for_ostream(fp, data, length);
  111. }
  112. // ----- low-level I/O for custom streams ----
  113. // The INPUT type needs to support a Read() method that takes a
  114. // buffer and a length and returns the number of bytes read.
  115. template <typename INPUT>
  116. inline bool read_data_internal(INPUT* fp, void*,
  117. void* data, size_t length) {
  118. return static_cast<size_t>(fp->Read(data, length)) == length;
  119. }
  120. // The OUTPUT type needs to support a Write() operation that takes
  121. // a buffer and a length and returns the number of bytes written.
  122. template <typename OUTPUT>
  123. inline bool write_data_internal(OUTPUT* fp, void*,
  124. const void* data, size_t length) {
  125. return static_cast<size_t>(fp->Write(data, length)) == length;
  126. }
  127. // ----- low-level I/O: the public API ----
  128. template <typename INPUT>
  129. inline bool read_data(INPUT* fp, void* data, size_t length) {
  130. return read_data_internal(fp, fp, data, length);
  131. }
  132. template <typename OUTPUT>
  133. inline bool write_data(OUTPUT* fp, const void* data, size_t length) {
  134. return write_data_internal(fp, fp, data, length);
  135. }
  136. // Uses read_data() and write_data() to read/write an integer.
  137. // length is the number of bytes to read/write (which may differ
  138. // from sizeof(IntType), allowing us to save on a 32-bit system
  139. // and load on a 64-bit system). Excess bytes are taken to be 0.
  140. // INPUT and OUTPUT must match legal inputs to read/write_data (above).
  141. template <typename INPUT, typename IntType>
  142. bool read_bigendian_number(INPUT* fp, IntType* value, size_t length) {
  143. *value = 0;
  144. unsigned char byte;
  145. // We require IntType to be unsigned or else the shifting gets all screwy.
  146. SPARSEHASH_COMPILE_ASSERT(static_cast<IntType>(-1) > static_cast<IntType>(0),
  147. serializing_int_requires_an_unsigned_type);
  148. for (size_t i = 0; i < length; ++i) {
  149. if (!read_data(fp, &byte, sizeof(byte))) return false;
  150. *value |= static_cast<IntType>(byte) << ((length - 1 - i) * 8);
  151. }
  152. return true;
  153. }
  154. template <typename OUTPUT, typename IntType>
  155. bool write_bigendian_number(OUTPUT* fp, IntType value, size_t length) {
  156. unsigned char byte;
  157. // We require IntType to be unsigned or else the shifting gets all screwy.
  158. SPARSEHASH_COMPILE_ASSERT(static_cast<IntType>(-1) > static_cast<IntType>(0),
  159. serializing_int_requires_an_unsigned_type);
  160. for (size_t i = 0; i < length; ++i) {
  161. byte = (sizeof(value) <= length-1 - i)
  162. ? 0 : static_cast<unsigned char>((value >> ((length-1 - i) * 8)) & 255);
  163. if (!write_data(fp, &byte, sizeof(byte))) return false;
  164. }
  165. return true;
  166. }
  167. // If your keys and values are simple enough, you can pass this
  168. // serializer to serialize()/unserialize(). "Simple enough" means
  169. // value_type is a POD type that contains no pointers. Note,
  170. // however, we don't try to normalize endianness.
  171. // This is the type used for NopointerSerializer.
  172. template <typename value_type> struct pod_serializer {
  173. template <typename INPUT>
  174. bool operator()(INPUT* fp, value_type* value) const {
  175. return read_data(fp, value, sizeof(*value));
  176. }
  177. template <typename OUTPUT>
  178. bool operator()(OUTPUT* fp, const value_type& value) const {
  179. return write_data(fp, &value, sizeof(value));
  180. }
  181. };
  182. // Settings contains parameters for growing and shrinking the table.
  183. // It also packages zero-size functor (ie. hasher).
  184. //
  185. // It does some munging of the hash value in cases where we think
  186. // (fear) the original hash function might not be very good. In
  187. // particular, the default hash of pointers is the identity hash,
  188. // so probably all the low bits are 0. We identify when we think
  189. // we're hashing a pointer, and chop off the low bits. Note this
  190. // isn't perfect: even when the key is a pointer, we can't tell
  191. // for sure that the hash is the identity hash. If it's not, this
  192. // is needless work (and possibly, though not likely, harmful).
  193. template<typename Key, typename HashFunc,
  194. typename SizeType, int HT_MIN_BUCKETS>
  195. class sh_hashtable_settings : public HashFunc {
  196. public:
  197. typedef Key key_type;
  198. typedef HashFunc hasher;
  199. typedef SizeType size_type;
  200. public:
  201. sh_hashtable_settings(const hasher& hf,
  202. const float ht_occupancy_flt,
  203. const float ht_empty_flt)
  204. : hasher(hf),
  205. enlarge_threshold_(0),
  206. shrink_threshold_(0),
  207. consider_shrink_(false),
  208. use_empty_(false),
  209. use_deleted_(false),
  210. num_ht_copies_(0) {
  211. set_enlarge_factor(ht_occupancy_flt);
  212. set_shrink_factor(ht_empty_flt);
  213. }
  214. size_type hash(const key_type& v) const {
  215. // We munge the hash value when we don't trust hasher::operator().
  216. return hash_munger<Key>::MungedHash(hasher::operator()(v));
  217. }
  218. float enlarge_factor() const {
  219. return enlarge_factor_;
  220. }
  221. void set_enlarge_factor(float f) {
  222. enlarge_factor_ = f;
  223. }
  224. float shrink_factor() const {
  225. return shrink_factor_;
  226. }
  227. void set_shrink_factor(float f) {
  228. shrink_factor_ = f;
  229. }
  230. size_type enlarge_threshold() const {
  231. return enlarge_threshold_;
  232. }
  233. void set_enlarge_threshold(size_type t) {
  234. enlarge_threshold_ = t;
  235. }
  236. size_type shrink_threshold() const {
  237. return shrink_threshold_;
  238. }
  239. void set_shrink_threshold(size_type t) {
  240. shrink_threshold_ = t;
  241. }
  242. size_type enlarge_size(size_type x) const {
  243. return static_cast<size_type>(x * enlarge_factor_);
  244. }
  245. size_type shrink_size(size_type x) const {
  246. return static_cast<size_type>(x * shrink_factor_);
  247. }
  248. bool consider_shrink() const {
  249. return consider_shrink_;
  250. }
  251. void set_consider_shrink(bool t) {
  252. consider_shrink_ = t;
  253. }
  254. bool use_empty() const {
  255. return use_empty_;
  256. }
  257. void set_use_empty(bool t) {
  258. use_empty_ = t;
  259. }
  260. bool use_deleted() const {
  261. return use_deleted_;
  262. }
  263. void set_use_deleted(bool t) {
  264. use_deleted_ = t;
  265. }
  266. size_type num_ht_copies() const {
  267. return static_cast<size_type>(num_ht_copies_);
  268. }
  269. void inc_num_ht_copies() {
  270. ++num_ht_copies_;
  271. }
  272. // Reset the enlarge and shrink thresholds
  273. void reset_thresholds(size_type num_buckets) {
  274. set_enlarge_threshold(enlarge_size(num_buckets));
  275. set_shrink_threshold(shrink_size(num_buckets));
  276. // whatever caused us to reset already considered
  277. set_consider_shrink(false);
  278. }
  279. // Caller is resposible for calling reset_threshold right after
  280. // set_resizing_parameters.
  281. void set_resizing_parameters(float shrink, float grow) {
  282. assert(shrink >= 0.0);
  283. assert(grow <= 1.0);
  284. if (shrink > grow/2.0f)
  285. shrink = grow / 2.0f; // otherwise we thrash hashtable size
  286. set_shrink_factor(shrink);
  287. set_enlarge_factor(grow);
  288. }
  289. // This is the smallest size a hashtable can be without being too crowded
  290. // If you like, you can give a min #buckets as well as a min #elts
  291. size_type min_buckets(size_type num_elts, size_type min_buckets_wanted) {
  292. float enlarge = enlarge_factor();
  293. size_type sz = HT_MIN_BUCKETS; // min buckets allowed
  294. while ( sz < min_buckets_wanted ||
  295. num_elts >= static_cast<size_type>(sz * enlarge) ) {
  296. // This just prevents overflowing size_type, since sz can exceed
  297. // max_size() here.
  298. if (static_cast<size_type>(sz * 2) < sz) {
  299. assert(false && "resize overflow"); // protect against overflow
  300. }
  301. sz *= 2;
  302. }
  303. return sz;
  304. }
  305. private:
  306. template<class HashKey> class hash_munger {
  307. public:
  308. static size_t MungedHash(size_t hash) {
  309. return hash;
  310. }
  311. };
  312. // This matches when the hashtable key is a pointer.
  313. template<class HashKey> class hash_munger<HashKey*> {
  314. public:
  315. static size_t MungedHash(size_t hash) {
  316. // TODO(csilvers): consider rotating instead:
  317. // static const int shift = (sizeof(void *) == 4) ? 2 : 3;
  318. // return (hash << (sizeof(hash) * 8) - shift)) | (hash >> shift);
  319. // This matters if we ever change sparse/dense_hash_* to compare
  320. // hashes before comparing actual values. It's speedy on x86.
  321. return hash / sizeof(void*); // get rid of known-0 bits
  322. }
  323. };
  324. size_type enlarge_threshold_; // table.size() * enlarge_factor
  325. size_type shrink_threshold_; // table.size() * shrink_factor
  326. float enlarge_factor_; // how full before resize
  327. float shrink_factor_; // how empty before resize
  328. // consider_shrink=true if we should try to shrink before next insert
  329. bool consider_shrink_;
  330. bool use_empty_; // used only by densehashtable, not sparsehashtable
  331. bool use_deleted_; // false until delkey has been set
  332. // num_ht_copies is a counter incremented every Copy/Move
  333. unsigned int num_ht_copies_;
  334. };
  335. } // namespace sparsehash_internal
  336. #undef SPARSEHASH_COMPILE_ASSERT
  337. _END_GOOGLE_NAMESPACE_
  338. #endif // UTIL_GTL_HASHTABLE_COMMON_H_