From f7441292bc1d6a42b3198a60ce47d47c63dca86d Mon Sep 17 00:00:00 2001
From: King_DuckZ <king_duckz@gmx.com>
Date: Thu, 12 Nov 2015 14:07:26 +0000
Subject: [PATCH] Add optional verbose mode that shows the number of hashed
 files.

---
 CMakeLists.txt           |  4 ++++
 find_duplicate_files.sql |  7 ++++++
 src/commandline.cpp      |  3 +++
 src/indexer.cpp          | 48 +++++++++++++++++++++++++++++++++++-----
 src/indexer.hpp          |  9 ++++++++
 src/main.cpp             | 46 +++++++++++++++++++++++++++++++++++++-
 6 files changed, 110 insertions(+), 7 deletions(-)
 create mode 100644 find_duplicate_files.sql
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 947a19b..a741ad8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,3 +47,7 @@ target_link_libraries(${PROJECT_NAME}
 	${YAMLCPP_LIBRARY}
 	${Boost_LIBRARIES}
 )
+
+target_compile_definitions(${PROJECT_NAME}
+	PUBLIC WITH_PROGRESS_FEEDBACK
+)
diff --git a/find_duplicate_files.sql b/find_duplicate_files.sql
new file mode 100644
index 0000000..2760e16
--- /dev/null
+++ b/find_duplicate_files.sql
@@ -0,0 +1,7 @@
+--select hash, group_id, count(hash) as hash_count from files group by hash, group_id having count(*) > 1 order by hash;
+
+--select hash, group_id from files group by hash, group_id having count(*) > 1 and count(order by hash;
+--For every hash found in more than one group: one row per (hash, group_id), with ct = number of files having that hash.
+select files.hash, group_id, t.ct from files inner join (
+	select hash, count(*) as ct from files group by hash having count(distinct group_id) > 1
+) t on t.hash = files.hash group by files.hash, group_id, t.ct order by files.hash;
diff --git a/src/commandline.cpp b/src/commandline.cpp
index ea97784..ce319e1 100644
--- a/src/commandline.cpp
+++ b/src/commandline.cpp
@@ -60,6 +60,9 @@ namespace din {
 			("help,h", "Produces this help message")
 			("version", "Prints the program's version and quits")
 			//("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout")
+#if defined(WITH_PROGRESS_FEEDBACK)
+			("quiet,q", "Hide progress messages and print nothing at all")
+#endif
 		;
 		po::options_description set_options("Set options");
 		set_options.add_options()
diff --git a/src/indexer.cpp b/src/indexer.cpp
index a3ea6b8..7290ffe 100644
--- a/src/indexer.cpp
+++ b/src/indexer.cpp
@@ -21,9 +21,13 @@
 #include "dbbackend.hpp"
 #include "settings.hpp"
 #include <algorithm>
+#include <functional>
 #include <vector>
 #include <string>
-#include <atomic>
+#if defined(WITH_PROGRESS_FEEDBACK)
+#	include <atomic>
+#	include <condition_variable>
+#endif
 #include <cstdint>
 #include <ciso646>
 #include <cassert>
@@ -61,7 +65,7 @@ namespace din {
 	};
 
 	namespace {
-		void hash_dir (std::vector<FileEntry>::iterator parEntry, std::vector<FileEntry>::iterator parEnd, const PathName& parCurrDir, std::atomic<std::size_t>& parDone) {
+		void hash_dir (std::vector<FileEntry>::iterator parEntry, std::vector<FileEntry>::iterator parEnd, const PathName& parCurrDir, std::function<void()> parItemDoneCallback) {
 			assert(parEntry != parEnd);
 			assert(parEntry->is_dir);
 			FileEntry& curr_entry = *parEntry;
@@ -90,7 +94,7 @@ namespace din {
 				while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) {
 					PathName curr_subdir(it_entry->path);
 					if (it_entry->is_dir) {
-						hash_dir(it_entry, parEnd, curr_subdir, parDone);
+						hash_dir(it_entry, parEnd, curr_subdir, parItemDoneCallback);
 
 						std::string relpath = make_relative_path(parCurrDir, curr_subdir).path();
 						const auto old_size = dir_blob.size();
@@ -133,7 +137,7 @@ namespace din {
 					std::cout << "Hashing file " << it_entry->path << "...";
 #endif
 					tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size);
-					++parDone;
+					parItemDoneCallback();
 #if defined(INDEXER_VERBOSE)
 					std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n';
 #endif
@@ -144,7 +148,7 @@ namespace din {
 #if defined(INDEXER_VERBOSE)
 			std::cout << "Final hash for dir " << parCurrDir << " is " << tiger_to_string(curr_entry_it->hash) << '\n';
 #endif
-			++parDone;
+			//parItemDoneCallback();
 		}
 	} //unnamed namespace
 
@@ -153,7 +157,10 @@ namespace din {
 
 		DinDBSettings db_settings;
 		PathList paths;
+#if defined(WITH_PROGRESS_FEEDBACK)
 		std::atomic<std::size_t> done_count;
+		std::condition_variable step_notify;
+#endif
 		std::size_t file_count;
 	};
 
@@ -189,7 +196,9 @@ namespace din {
 		//assert(not (FileEntry("/a/b/1.txt", 3, false, false) < FileEntry("/a/b/c/f.txt", 4, true, false)));
 		//assert(not (FileEntry("/a/b/c/file.c", 4, false, false) < FileEntry("/a/b/c", 3, true, false)));
 #endif
+#if defined(WITH_PROGRESS_FEEDBACK)
 		m_local_data->done_count = 0;
+#endif
 		m_local_data->file_count = 0;
 		m_local_data->db_settings = parDBSettings;
 	}
@@ -201,9 +210,11 @@ namespace din {
 		return m_local_data->file_count;
 	}
 
+#if defined(WITH_PROGRESS_FEEDBACK)
 	std::size_t Indexer::processed_items() const {
 		return m_local_data->done_count;
 	}
+#endif
 
 	void Indexer::calculate_hash() {
 		PathName base_path(m_local_data->paths.front().path);
@@ -223,10 +234,27 @@ namespace din {
 		std::cout << "-----------------------------------------------------\n";
 #endif
 
+#if defined(WITH_PROGRESS_FEEDBACK)
 		m_local_data->done_count = 0;
-		hash_dir(m_local_data->paths.begin(), m_local_data->paths.end(), base_path, m_local_data->done_count);
+		hash_dir(
+			m_local_data->paths.begin(),
+			m_local_data->paths.end(),
+			base_path,
+			[=]() {
+				++m_local_data->done_count;
+				m_local_data->step_notify.notify_all();
+			}
+		);
 
 		assert(m_local_data->done_count == m_local_data->paths.size());
+#else
+		hash_dir(
+			m_local_data->paths.begin(),
+			m_local_data->paths.end(),
+			base_path,
+			[]() {}
+		);
+#endif
 
 #if defined(INDEXER_VERBOSE)
 		for (const auto& itm : m_local_data->paths) {
@@ -236,7 +264,9 @@ namespace din {
 	}
 
 	void Indexer::add_to_db (const std::string& parSetName, char parType) const {
+#if defined(WITH_PROGRESS_FEEDBACK)
 		assert(m_local_data->done_count == m_local_data->paths.size());
+#endif
 		PathName base_path(m_local_data->paths.front().path);
 		std::vector<FileRecordData> data;
 		data.reserve(m_local_data->paths.size());
@@ -287,4 +317,10 @@ namespace din {
 	bool Indexer::empty() const {
 		return m_local_data->paths.size() < 2;
 	}
+
+#if defined(WITH_PROGRESS_FEEDBACK)
+	std::condition_variable& Indexer::step_notify() { //the condition variable calculate_hash() notifies after each hashed file
+		return m_local_data->step_notify;
+	}
+#endif
 } //namespace din
diff --git a/src/indexer.hpp b/src/indexer.hpp
index 7985abd..0ce3f56 100644
--- a/src/indexer.hpp
+++ b/src/indexer.hpp
@@ -24,6 +24,12 @@
 #	define INDEXER_VERBOSE
 #endif
 
+#if defined(WITH_PROGRESS_FEEDBACK)
+//Declaring names inside namespace std is undefined behaviour (and clashes with
+//implementations that keep the class in an inline namespace), so use the real header.
+#	include <condition_variable>
+#endif
+
 namespace din {
 	struct DinDBSettings;
 
@@ -40,7 +46,10 @@ namespace din {
 #endif
 
 		std::size_t total_items ( void ) const;
+#if defined(WITH_PROGRESS_FEEDBACK)
 		std::size_t processed_items ( void ) const;
+		std::condition_variable& step_notify ( void );
+#endif
 		void calculate_hash ( void );
 		void add_to_db ( const std::string& parSetName, char parType ) const;
 		bool empty ( void ) const;
diff --git a/src/main.cpp b/src/main.cpp
index a63db57..76e15e3 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -17,11 +17,20 @@
 
 #include <iostream>
 #include <ciso646>
+#if defined(WITH_PROGRESS_FEEDBACK)
+#	include <thread>
+#	include <mutex>
+#	include <condition_variable>
+#endif
 #include "filesearcher.hpp"
 #include "indexer.hpp"
 #include "settings.hpp"
 #include "commandline.hpp"
 
+namespace {
+	void run_hash_calculation ( din::Indexer& parIndexer, bool parShowProgress ); //defined after main()
+} //unnamed namespace
+
 int main (int parArgc, char* parArgv[]) {
 	using std::placeholders::_1;
 	using std::placeholders::_2;
@@ -62,8 +71,43 @@ int main (int parArgc, char* parArgv[]) {
 		return 1;
 	}
 	else {
-		indexer.calculate_hash();
+#if defined(WITH_PROGRESS_FEEDBACK)
+		const bool verbose = (0 == vm.count("quiet"));
+#else
+		const bool verbose = false;
+#endif
+		run_hash_calculation(indexer, verbose);
 		indexer.add_to_db(vm["setname"].as<std::string>(), vm["type"].as<char>());
 	}
 	return 0;
 }
+
+namespace {
+	//Hashes all items; if parShowProgress is set (and progress support is compiled in) a worker thread hashes while this one prints a counter.
+	void run_hash_calculation (din::Indexer& parIndexer, bool parShowProgress) {
+#if defined(WITH_PROGRESS_FEEDBACK)
+		if (parShowProgress) {
+			std::cout << "Fetching items list...\n";
+			const auto total_items = parIndexer.total_items();
+			std::mutex progress_print;
+			bool finished = false; //guarded by progress_print, so the worker's last wake-up can never be missed
+			std::thread hash_thread([&]() {
+				parIndexer.calculate_hash();
+				std::lock_guard<std::mutex> guard(progress_print);
+				finished = true;
+				parIndexer.step_notify().notify_all();
+			});
+			std::unique_lock<std::mutex> lk(progress_print);
+			while (not finished) { //ends when the worker ends, even if the counter never reaches total_items
+				parIndexer.step_notify().wait(lk); //a missed per-item notify only delays the next refresh
+				std::cout << "Processed " << parIndexer.processed_items() << " of " << total_items << '\r' << std::flush;
+			}
+			hash_thread.join(); //safe while holding lk: the worker no longer needs the mutex once finished is set
+			if (parIndexer.processed_items() > 0) { std::cout << '\n'; }
+			return;
+		}
+#endif
+		static_cast<void>(parShowProgress); //unused when progress feedback is compiled out
+		parIndexer.calculate_hash();
+	}
+} //unnamed namespace