Source file hash.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
(*
   This is the interface to the runtime support for [ppx_hash].

   The [ppx_hash] syntax extension supports: [@@deriving hash] and [%hash_fold: TYPE] and
   [%hash: TYPE]

   For type [t] a function [hash_fold_t] of type [Hash.state -> t -> Hash.state] is
   generated.

   The generated [hash_fold_<T>] function is compositional, following the structure of the
   type; allowing user overrides at every level. This is in contrast to ocaml's builtin
   polymorphic hashing [Hashtbl.hash] which ignores user overrides.

   The generator also provides a direct hash-function [hash] (named [hash_<T>] when <T> !=
   "t") of type: [t -> Hash.hash_value].

   The folding hash function can be accessed as [%hash_fold: TYPE]
   The direct hash function can be accessed as [%hash: TYPE]
*)

open! Import0
module Array = Array0
module Char = Char0
module Int = Int0
module List = List0
include Hash_intf

(** Builtin folding-style hash functions, abstracted over [Hash_intf.S] *)
module Folding (Hash : Hash_intf.S) :
  Hash_intf.Builtin_intf
  with type state = Hash.state
   and type hash_value = Hash.hash_value = struct
  type state = Hash.state
  type hash_value = Hash.hash_value
  type 'a folder = state -> 'a -> state

  let hash_fold_unit s () = s
  let hash_fold_int = Hash.fold_int
  let hash_fold_int64 = Hash.fold_int64
  let hash_fold_float = Hash.fold_float
  let hash_fold_string = Hash.fold_string
  let as_int f s x = hash_fold_int s (f x)

  (* This ignores the sign bit on 32-bit architectures, but it's unlikely to lead to
     frequent collisions (min_value colliding with 0 is the most likely one).  *)
  let hash_fold_int32 = as_int Stdlib.Int32.to_int
  let hash_fold_char = as_int Char.to_int

  let hash_fold_bool =
    as_int (function
      | true -> 1
      | false -> 0)
  ;;

  let hash_fold_nativeint s x = hash_fold_int64 s (Stdlib.Int64.of_nativeint x)

  let hash_fold_option hash_fold_elem s = function
    | None -> hash_fold_int s 0
    | Some x -> hash_fold_elem (hash_fold_int s 1) x
  ;;

  let rec hash_fold_list_body hash_fold_elem s list =
    match list with
    | [] -> s
    | x :: xs -> hash_fold_list_body hash_fold_elem (hash_fold_elem s x) xs
  ;;

  let hash_fold_list hash_fold_elem s list =
    (* The [length] of the list must be incorporated into the hash-state so values of
       types such as [unit list] - ([], [()], [();()],..) are hashed differently. *)
    (* The [length] must come before the elements to avoid a violation of the rule
       enforced by Perfect_hash. *)
    let s = hash_fold_int s (List.length list) in
    let s = hash_fold_list_body hash_fold_elem s list in
    s
  ;;

  let hash_fold_lazy_t hash_fold_elem s x = hash_fold_elem s (Stdlib.Lazy.force x)
  let hash_fold_ref_frozen hash_fold_elem s x = hash_fold_elem s !x

  let rec hash_fold_array_frozen_i hash_fold_elem s array i =
    if i = Array.length array
    then s
    else (
      let e = Array.unsafe_get array i in
      hash_fold_array_frozen_i hash_fold_elem (hash_fold_elem s e) array (i + 1))
  ;;

  let hash_fold_array_frozen hash_fold_elem s array =
    hash_fold_array_frozen_i
      (* [length] must be incorporated for arrays, as it is for lists. See comment above *)
      hash_fold_elem
      (hash_fold_int s (Array.length array))
      array
      0
  ;;

  (* the duplication here is because we think
     ocaml can't eliminate indirect function calls otherwise. *)
  let hash_nativeint x =
    Hash.get_hash_value (hash_fold_nativeint (Hash.reset (Hash.alloc ())) x)
  ;;

  let hash_int64 x = Hash.get_hash_value (hash_fold_int64 (Hash.reset (Hash.alloc ())) x)
  let hash_int32 x = Hash.get_hash_value (hash_fold_int32 (Hash.reset (Hash.alloc ())) x)
  let hash_char x = Hash.get_hash_value (hash_fold_char (Hash.reset (Hash.alloc ())) x)
  let hash_int x = Hash.get_hash_value (hash_fold_int (Hash.reset (Hash.alloc ())) x)
  let hash_bool x = Hash.get_hash_value (hash_fold_bool (Hash.reset (Hash.alloc ())) x)

  let hash_string x =
    Hash.get_hash_value (hash_fold_string (Hash.reset (Hash.alloc ())) x)
  ;;

  let hash_float x = Hash.get_hash_value (hash_fold_float (Hash.reset (Hash.alloc ())) x)
  let hash_unit x = Hash.get_hash_value (hash_fold_unit (Hash.reset (Hash.alloc ())) x)
end

module F (Hash : Hash_intf.S) :
  Hash_intf.Full
  with type hash_value = Hash.hash_value
   and type state = Hash.state
   and type seed = Hash.seed = struct
  include Hash

  type 'a folder = state -> 'a -> state

  let create ?seed () = reset ?seed (alloc ())
  let of_fold hash_fold_t t = get_hash_value (hash_fold_t (create ()) t)

  module Builtin = Folding (Hash)

  let run ?seed folder x =
    Hash.get_hash_value (folder (Hash.reset ?seed (Hash.alloc ())) x)
  ;;
end

module Internalhash : sig
  include
    Hash_intf.S
    with type state = Base_internalhash_types.state
     (* We give a concrete type for [state], albeit only partially exposed (see
        Base_internalhash_types), so that it unifies with the same type in [Base_boot],
        and to allow optimizations for the immediate type. *)
     and type seed = Base_internalhash_types.seed
     and type hash_value = Base_internalhash_types.hash_value

  external fold_int64 : state -> int64 -> state = "Base_internalhash_fold_int64"
  [@@noalloc]

  external fold_int : state -> int -> state = "Base_internalhash_fold_int" [@@noalloc]

  external fold_float : state -> float -> state = "Base_internalhash_fold_float"
  [@@noalloc]

  external fold_string : state -> string -> state = "Base_internalhash_fold_string"
  [@@noalloc]

  external get_hash_value : state -> hash_value = "Base_internalhash_get_hash_value"
  [@@noalloc]
end = struct
  let description = "internalhash"

  include Base_internalhash_types

  let alloc () = create_seeded 0
  let reset ?(seed = 0) _t = create_seeded seed

  module For_tests = struct
    let compare_state (a : state) (b : state) = compare (a :> int) (b :> int)
    let state_to_string (state : state) = Int.to_string (state :> int)
  end
end

module T = struct
  include Internalhash

  type 'a folder = state -> 'a -> state

  let create ?seed () = reset ?seed (alloc ())
  let run ?seed folder x = get_hash_value (folder (reset ?seed (alloc ())) x)
  let of_fold hash_fold_t t = get_hash_value (hash_fold_t (create ()) t)

  module Builtin = struct
    module Folding = Folding (Internalhash)
    include Folding

    (* [Folding] provides some default implementations for the [hash_*] functions below,
       but they are inefficient for some use-cases because of the use of the [hash_fold]
       functions. At this point, the [hash_value] type has been fixed to [int], so this
       module can provide specialized implementations. *)

    let hash_char = Char0.to_int

    (* This hash was chosen from here: https://gist.github.com/badboy/6267743

       It attempts to fulfill the primary goals of a non-cryptographic hash function:

       - a bit change in the input should change ~1/2 of the output bits
       - the output should be uniformly distributed across the output range
       - inputs that are close to each other shouldn't lead to outputs that are close to
         each other.
       - all bits of the input are used in generating the output

       In our case we also want it to be fast, non-allocating, and inlinable.  *)
    let[@inline always] hash_int (t : int) =
      let t = lnot t + (t lsl 21) in
      let t = t lxor (t lsr 24) in
      let t = t + (t lsl 3) + (t lsl 8) in
      let t = t lxor (t lsr 14) in
      let t = t + (t lsl 2) + (t lsl 4) in
      let t = t lxor (t lsr 28) in
      t + (t lsl 31)
    ;;

    let hash_bool x = if x then 1 else 0

    external hash_float : float -> int = "Base_hash_double" [@@noalloc]

    let hash_unit () = 0
  end
end

include T