2 * Program: pgn-extract: a Portable Game Notation (PGN) extractor.
3 * Copyright (C) 1994-2014 David Barnes
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 1, or (at your option)
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 * David Barnes may be contacted as D.J.Barnes@kent.ac.uk
19 * http://www.cs.kent.ac.uk/people/staff/djb/
27 #if defined(__BORLANDC__) || defined(_MSC_VER)
43 /* Routines, similar in nature to those in apply.c
44 * to implement a duplicate hash-table lookup using
45 * an external file, rather than malloc'd memory.
46 * The only limit should be a file system limit.
48 * This version should be slightly more accurate than
49 * the alternative because the final_ and cumulative_
50 * hash values are both stored, rather than the XOR
54 /* The name of the file used.
55 * This is overwritten each time, and removed on normal
58 static char VIRTUAL_FILE[] = "virtual.tmp";
60 /* Define the size of the hash table.
61 * Size was 8191 but that seems unduly conservative these days.
63 #define LOG_TABLE_SIZE 100003
64 /* Define a table to hold hash values of the extracted games.
65 * This is used to enable duplicate detection.
68 /* Record the file offset of the first and last entries
69 * for an index. (head == -1) => empty.
74 /* If use_virtual_hash_table */
75 static LogHeaderEntry *VirtualLogTable = NULL;
77 /* Define a table to hold hash values of the extracted games.
78 * This is used to enable duplicate detection when not using
79 * the virtual hash table.
81 static HashLog **LogTable = NULL;
83 /* Define a type to hold hash values of interest.
84 * This is used both to aid in duplicate detection
85 * and in finding positional variations.
87 typedef struct VirtualHashLog {
88 /* Store the final position hash value and
89 * the cumulative hash value for a game.
91 HashCode final_hash_value, cumulative_hash_value;
92 /* Record the file list index for the file this game was first found in. */
94 /* Record the file offset of the next element
95 * in this list. -1 => end-of-list.
100 static FILE *hash_file = NULL;
102 static const char *previous_virtual_occurance(Game game_details);
104 /* Determine which table to initialise, depending
105 * on whether use_virtual_hash_table is set or not.
108 init_duplicate_hash_table(void)
111 if(GlobalState.use_virtual_hash_table){
112 VirtualLogTable = (LogHeaderEntry *)
113 MallocOrDie(LOG_TABLE_SIZE*sizeof(*VirtualLogTable));
114 for(i = 0; i < LOG_TABLE_SIZE; i++){
115 VirtualLogTable[i].head = VirtualLogTable[i].tail = -1;
117 hash_file = fopen(VIRTUAL_FILE,"w+b");
118 if(hash_file == NULL){
119 fprintf(GlobalState.logfile,"Unable to open %s\n",
124 LogTable = (HashLog**) MallocOrDie(LOG_TABLE_SIZE*sizeof(*LogTable));
125 for(i = 0; i < LOG_TABLE_SIZE; i++){
131 /* Close and remove the temporary file if in use. */
133 clear_duplicate_hash_table(void)
135 if(GlobalState.use_virtual_hash_table){
136 if(hash_file != NULL){
137 (void) fclose(hash_file);
138 unlink(VIRTUAL_FILE);
144 /* Retrieve a duplicate table entry from the hash file. */
146 retrieve_virtual_entry(long ix,VirtualHashLog *entry)
148 if(hash_file == NULL){
151 else if(fseek(hash_file,ix,SEEK_SET) != 0){
152 fprintf(GlobalState.logfile,
153 "Fseek error to %ld in retrieve_virtual_entry\n",ix);
156 else if(fread((void *)entry,sizeof(*entry),1,hash_file) != 1){
157 fprintf(GlobalState.logfile,
158 "Fread error from %ld in retrieve_virtual_entry\n",ix);
166 /* Write a duplicate table entry to the hash file. */
168 write_virtual_entry(long where,const VirtualHashLog *entry)
170 if(fseek(hash_file,where,SEEK_SET) != 0){
171 fprintf(GlobalState.logfile,
172 "Fseek error to %ld in write_virtual_entry\n",where);
175 else if(fwrite((void *)entry,sizeof(*entry),1,hash_file) != 1){
176 fprintf(GlobalState.logfile,
177 "Fwrite error from %ld in write_virtual_entry\n",where);
186 /* Return the name of the original file if it looks like we
187 * have met the moves in game_details before, otherwise return
188 * NULL. A match is assumed to be so if both
189 * the final_ and cumulative_ hash values in game_details
190 * are already present in VirtualLogTable.
193 previous_virtual_occurance(Game game_details)
194 { unsigned ix = game_details.final_hash_value % LOG_TABLE_SIZE;
195 VirtualHashLog entry;
196 Boolean duplicate = FALSE;
197 const char *original_filename = NULL;
200 /* Are we keeping this information? */
201 if(GlobalState.suppress_duplicates || GlobalState.suppress_originals ||
202 GlobalState.duplicate_file != NULL){
203 if(VirtualLogTable[ix].head < 0l){
204 /* First occurrence. */
208 retrieve_virtual_entry(VirtualLogTable[ix].head,&entry);
210 while(keep_going && !duplicate){
211 if((entry.final_hash_value == game_details.final_hash_value) &&
212 (entry.cumulative_hash_value == game_details.cumulative_hash_value)){
214 * Determine where it first occured.
216 original_filename = input_file_name(entry.file_number);
219 else if(entry.next >= 0l){
220 keep_going = retrieve_virtual_entry(entry.next,&entry);
229 /* Write an entry for it. */
230 /* Where to write the next VirtualHashLog entry. */
231 static long next_free_entry = 0l;
233 /* Store the XOR of the two hash values. */
234 entry.final_hash_value = game_details.final_hash_value ;
235 entry.cumulative_hash_value = game_details.cumulative_hash_value;
236 entry.file_number = current_file_number();
239 /* Write out these details. */
240 if(write_virtual_entry(next_free_entry,&entry)){
241 long where_written = next_free_entry;
242 /* Move on ready for next time. */
243 next_free_entry += sizeof(entry);
245 /* Now update the index table. */
246 if(VirtualLogTable[ix].head < 0l){
247 /* First occurrence. */
248 VirtualLogTable[ix].head =
249 VirtualLogTable[ix].tail = where_written;
254 if(retrieve_virtual_entry(VirtualLogTable[ix].tail,&tail)){
255 tail.next = where_written;
256 (void) write_virtual_entry(VirtualLogTable[ix].tail,&tail);
257 /* Store the new tail address. */
258 VirtualLogTable[ix].tail = where_written;
264 return original_filename;
267 /* Return the name of the original file if it looks like we
268 * have met the moves in game_details before, otherwise return
270 * For non-fuzzy comparision, a match is assumed to be so if both
271 * final_ and cumulative_ hash values are already present
272 * as a pair in LogTable.
273 * Fuzzy matches depend on the match depth and do not use the
274 * cumulative hash value.
277 previous_occurance(Game game_details, int plycount)
279 const char *original_filename = NULL;
280 if(GlobalState.use_virtual_hash_table){
281 original_filename = previous_virtual_occurance(game_details);
284 unsigned ix = game_details.final_hash_value % LOG_TABLE_SIZE;
285 HashLog *entry = NULL;
286 Boolean duplicate = FALSE;
288 /* Are we keeping this information? */
289 if(GlobalState.suppress_duplicates || GlobalState.suppress_originals ||
290 GlobalState.fuzzy_match_duplicates ||
291 GlobalState.duplicate_file != NULL){
292 for(entry = LogTable[ix]; !duplicate && (entry != NULL);
293 entry = entry->next){
294 if(entry->final_hash_value == game_details.final_hash_value &&
295 entry->cumulative_hash_value == game_details.cumulative_hash_value){
296 /* An exact match. */
299 else if(GlobalState.fuzzy_match_duplicates) {
300 if(GlobalState.fuzzy_match_depth == 0 &&
301 entry->final_hash_value == game_details.final_hash_value) {
302 /* Accept positional match at the end of the game. */
306 /* Need to check at the fuzzy_match_depth. */
307 if(entry->final_hash_value == game_details.fuzzy_duplicate_hash) {
314 * Determine where it first occured.
316 original_filename = input_file_name(entry->file_number);
321 /* First occurrence, so add it to the log. */
322 entry = (HashLog *) MallocOrDie(sizeof(*entry));
324 if(!GlobalState.fuzzy_match_duplicates) {
325 /* Store the two hash values. */
326 entry->final_hash_value = game_details.final_hash_value;
327 entry->cumulative_hash_value = game_details.cumulative_hash_value;
329 else if(GlobalState.fuzzy_match_depth > 0 &&
330 plycount >= GlobalState.fuzzy_match_depth) {
331 /* Store just the hash value from the fuzzy depth. */
332 entry->final_hash_value = game_details.fuzzy_duplicate_hash;
333 entry->cumulative_hash_value = 0;
336 /* Store the two hash values. */
337 entry->final_hash_value = game_details.final_hash_value;
338 entry->cumulative_hash_value = game_details.cumulative_hash_value;
340 entry->file_number = current_file_number();
341 /* Link it into the head at this index. */
342 entry->next = LogTable[ix];
343 LogTable[ix] = entry;
347 return original_filename;