From f7441292bc1d6a42b3198a60ce47d47c63dca86d Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Thu, 12 Nov 2015 14:07:26 +0000 Subject: [PATCH] Add optional verbose mode that shows the number of hashed files. --- CMakeLists.txt | 4 ++++ find_duplicate_files.sql | 7 ++++++ src/commandline.cpp | 3 +++ src/indexer.cpp | 48 +++++++++++++++++++++++++++++++++++----- src/indexer.hpp | 9 ++++++++ src/main.cpp | 46 +++++++++++++++++++++++++++++++++++++- 6 files changed, 110 insertions(+), 7 deletions(-) create mode 100644 find_duplicate_files.sql diff --git a/CMakeLists.txt b/CMakeLists.txt index 947a19b..a741ad8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,3 +47,7 @@ target_link_libraries(${PROJECT_NAME} ${YAMLCPP_LIBRARY} ${Boost_LIBRARIES} ) + +target_compile_definitions(${PROJECT_NAME} + PUBLIC WITH_PROGRESS_FEEDBACK +) diff --git a/find_duplicate_files.sql b/find_duplicate_files.sql new file mode 100644 index 0000000..2760e16 --- /dev/null +++ b/find_duplicate_files.sql @@ -0,0 +1,7 @@ +--select hash, group_id, count(hash) as hash_count from files group by hash, group_id having count(*) > 1 order by hash; + +--select hash, group_id from files group by hash, group_id having count(*) > 1 and count(order by hash; + +select files.hash, group_id, t.ct from files inner join ( + select hash, count(*) as ct from files group by hash having count(distinct group_id) > 1 +) t on t.hash = files.hash group by files.hash, group_id, t.ct order by files.hash; diff --git a/src/commandline.cpp b/src/commandline.cpp index ea97784..ce319e1 100644 --- a/src/commandline.cpp +++ b/src/commandline.cpp @@ -60,6 +60,9 @@ namespace din { ("help,h", "Produces this help message") ("version", "Prints the program's version and quits") //("dump-raw,D", po::value(), "Saves the retrieved html to the named file; use - for stdout") +#if defined(WITH_PROGRESS_FEEDBACK) + ("quiet,q", "Hide progress messages and print nothing at all") +#endif ; po::options_description set_options("Set options"); set_options.add_options() diff --git a/src/indexer.cpp b/src/indexer.cpp index a3ea6b8..7290ffe 100644 --- a/src/indexer.cpp +++ b/src/indexer.cpp @@ -21,9 +21,13 @@ #include "dbbackend.hpp" #include "settings.hpp" #include +#include #include #include -#include +#if defined(WITH_PROGRESS_FEEDBACK) +# include +# include +#endif #include #include #include @@ -61,7 +65,7 @@ namespace din { }; namespace { - void hash_dir (std::vector::iterator parEntry, std::vector::iterator parEnd, const PathName& parCurrDir, std::atomic& parDone) { + void hash_dir (std::vector::iterator parEntry, std::vector::iterator parEnd, const PathName& parCurrDir, std::function parItemDoneCallback) { assert(parEntry != parEnd); assert(parEntry->is_dir); FileEntry& curr_entry = *parEntry; @@ -90,7 +94,7 @@ namespace din { while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) { PathName curr_subdir(it_entry->path); if (it_entry->is_dir) { - hash_dir(it_entry, parEnd, curr_subdir, parDone); + hash_dir(it_entry, parEnd, curr_subdir, parItemDoneCallback); std::string relpath = make_relative_path(parCurrDir, curr_subdir).path(); const auto old_size = dir_blob.size(); @@ -133,7 +137,7 @@ namespace din { std::cout << "Hashing file " << it_entry->path << "..."; #endif tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size); - ++parDone; + parItemDoneCallback(); #if defined(INDEXER_VERBOSE) std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n'; #endif @@ -144,7 +148,7 @@ namespace din { #if defined(INDEXER_VERBOSE) std::cout << "Final hash for dir " << parCurrDir << " is " << tiger_to_string(curr_entry_it->hash) << '\n'; #endif - ++parDone; + //parItemDoneCallback(); } } //unnamed namespace @@ -153,7 +157,10 @@ namespace din { DinDBSettings db_settings; PathList paths; +#if defined(WITH_PROGRESS_FEEDBACK) std::atomic done_count; + std::condition_variable step_notify; +#endif std::size_t file_count; }; @@ -189,7 +196,9 @@ namespace din { //assert(not (FileEntry("/a/b/1.txt", 3, false, false) < FileEntry("/a/b/c/f.txt", 4, true, false))); //assert(not (FileEntry("/a/b/c/file.c", 4, false, false) < FileEntry("/a/b/c", 3, true, false))); #endif +#if defined(WITH_PROGRESS_FEEDBACK) m_local_data->done_count = 0; +#endif m_local_data->file_count = 0; m_local_data->db_settings = parDBSettings; } @@ -201,9 +210,11 @@ namespace din { return m_local_data->file_count; } +#if defined(WITH_PROGRESS_FEEDBACK) std::size_t Indexer::processed_items() const { return m_local_data->done_count; } +#endif void Indexer::calculate_hash() { PathName base_path(m_local_data->paths.front().path); @@ -223,10 +234,27 @@ namespace din { std::cout << "-----------------------------------------------------\n"; #endif +#if defined(WITH_PROGRESS_FEEDBACK) m_local_data->done_count = 0; - hash_dir(m_local_data->paths.begin(), m_local_data->paths.end(), base_path, m_local_data->done_count); + hash_dir( + m_local_data->paths.begin(), + m_local_data->paths.end(), + base_path, + [=]() { + ++m_local_data->done_count; + m_local_data->step_notify.notify_all(); + } + ); assert(m_local_data->done_count == m_local_data->paths.size()); +#else + hash_dir( + m_local_data->paths.begin(), + m_local_data->paths.end(), + base_path, + []() {} + ); +#endif #if defined(INDEXER_VERBOSE) for (const auto& itm : m_local_data->paths) { @@ -236,7 +264,9 @@ namespace din { } void Indexer::add_to_db (const std::string& parSetName, char parType) const { +#if defined(WITH_PROGRESS_FEEDBACK) assert(m_local_data->done_count == m_local_data->paths.size()); +#endif PathName base_path(m_local_data->paths.front().path); std::vector data; data.reserve(m_local_data->paths.size()); @@ -287,4 +317,10 @@ namespace din { bool Indexer::empty() const { return m_local_data->paths.size() < 2; } + +#if defined(WITH_PROGRESS_FEEDBACK) + std::condition_variable& Indexer::step_notify() { + return m_local_data->step_notify; + } +#endif } //namespace din diff --git a/src/indexer.hpp b/src/indexer.hpp index 7985abd..0ce3f56 100644 --- a/src/indexer.hpp +++ b/src/indexer.hpp @@ -24,6 +24,12 @@ # define INDEXER_VERBOSE #endif +#if defined(WITH_PROGRESS_FEEDBACK) +namespace std { + class condition_variable; +} //namespace std +#endif + namespace din { struct DinDBSettings; @@ -40,7 +46,10 @@ namespace din { #endif std::size_t total_items ( void ) const; +#if defined(WITH_PROGRESS_FEEDBACK) std::size_t processed_items ( void ) const; + std::condition_variable& step_notify ( void ); +#endif void calculate_hash ( void ); void add_to_db ( const std::string& parSetName, char parType ) const; bool empty ( void ) const; diff --git a/src/main.cpp b/src/main.cpp index a63db57..76e15e3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -17,11 +17,20 @@ #include #include +#if defined(WITH_PROGRESS_FEEDBACK) +# include +# include +# include +#endif #include "filesearcher.hpp" #include "indexer.hpp" #include "settings.hpp" #include "commandline.hpp" +namespace { + void run_hash_calculation ( din::Indexer& parIndexer, bool parShowProgress ); +} //unnamed namespace + int main (int parArgc, char* parArgv[]) { using std::placeholders::_1; using std::placeholders::_2; @@ -62,8 +71,43 @@ int main (int parArgc, char* parArgv[]) { return 1; } else { - indexer.calculate_hash(); +#if defined(WITH_PROGRESS_FEEDBACK) + const bool verbose = (0 == vm.count("quiet")); +#else + const bool verbose = false; +#endif + run_hash_calculation(indexer, verbose); indexer.add_to_db(vm["setname"].as(), vm["type"].as()); } return 0; } + +namespace { + void run_hash_calculation (din::Indexer& parIndexer, bool parShowProgress) { +#if !defined(WITH_PROGRESS_FEEDBACK) + parShowProgress = false; +#endif + if (not parShowProgress) { + parIndexer.calculate_hash(); + } +#if defined(WITH_PROGRESS_FEEDBACK) + else { + std::cout << "Fetching items list...\n"; + const auto total_items = parIndexer.total_items(); + std::thread hash_thread(&din::Indexer::calculate_hash, &parIndexer); + std::mutex progress_print; + while (parIndexer.processed_items() != total_items) { + std::unique_lock lk(progress_print); + parIndexer.step_notify().wait(lk); + std::cout << "Processed " << parIndexer.processed_items() << " of " << total_items << '\r'; + std::cout.flush(); + }; + + hash_thread.join(); + if (parIndexer.processed_items() > 0) { + std::cout << '\n'; + } + } +#endif + } +} //unnamed namespace