1
0
Fork 0
mirror of https://github.com/KingDuckZ/dindexer.git synced 2024-11-25 00:53:43 +00:00

Add optional verbose mode that shows the number of hashed files.

This commit is contained in:
King_DuckZ 2015-11-12 14:07:26 +00:00
parent a82ab4a4ed
commit f7441292bc
6 changed files with 110 additions and 7 deletions

View file

@ -47,3 +47,7 @@ target_link_libraries(${PROJECT_NAME}
${YAMLCPP_LIBRARY} ${YAMLCPP_LIBRARY}
${Boost_LIBRARIES} ${Boost_LIBRARIES}
) )
# Define WITH_PROGRESS_FEEDBACK for this target so the source code guarded by
# "#if defined(WITH_PROGRESS_FEEDBACK)" gets compiled in.
target_compile_definitions(${PROJECT_NAME}
PUBLIC WITH_PROGRESS_FEEDBACK
)

7
find_duplicate_files.sql Normal file
View file

@ -0,0 +1,7 @@
--select hash, group_id, count(hash) as hash_count from files group by hash, group_id having count(*) > 1 order by hash;
--select hash, group_id from files group by hash, group_id having count(*) > 1 and count(order by hash;
-- Lists every hash that appears in more than one group. The inner query keeps
-- only hashes with at least two distinct group_id values and counts the rows
-- sharing each hash (ct); the outer query then emits one row per
-- (hash, group_id) pair for those hashes, ordered by hash.
select files.hash, group_id, t.ct from files inner join (
select hash, count(*) as ct from files group by hash having count(distinct group_id) > 1
) t on t.hash = files.hash group by files.hash, group_id, t.ct order by files.hash;

View file

@ -60,6 +60,9 @@ namespace din {
("help,h", "Produces this help message") ("help,h", "Produces this help message")
("version", "Prints the program's version and quits") ("version", "Prints the program's version and quits")
//("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout") //("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout")
#if defined(WITH_PROGRESS_FEEDBACK)
("quiet,q", "Hide progress messages and print nothing at all")
#endif
; ;
po::options_description set_options("Set options"); po::options_description set_options("Set options");
set_options.add_options() set_options.add_options()

View file

@ -21,9 +21,13 @@
#include "dbbackend.hpp" #include "dbbackend.hpp"
#include "settings.hpp" #include "settings.hpp"
#include <algorithm> #include <algorithm>
#include <functional>
#include <vector> #include <vector>
#include <string> #include <string>
#if defined(WITH_PROGRESS_FEEDBACK)
# include <atomic> # include <atomic>
# include <condition_variable>
#endif
#include <cstdint> #include <cstdint>
#include <ciso646> #include <ciso646>
#include <cassert> #include <cassert>
@ -61,7 +65,7 @@ namespace din {
}; };
namespace { namespace {
void hash_dir (std::vector<FileEntry>::iterator parEntry, std::vector<FileEntry>::iterator parEnd, const PathName& parCurrDir, std::atomic<std::size_t>& parDone) { void hash_dir (std::vector<FileEntry>::iterator parEntry, std::vector<FileEntry>::iterator parEnd, const PathName& parCurrDir, std::function<void()> parItemDoneCallback) {
assert(parEntry != parEnd); assert(parEntry != parEnd);
assert(parEntry->is_dir); assert(parEntry->is_dir);
FileEntry& curr_entry = *parEntry; FileEntry& curr_entry = *parEntry;
@ -90,7 +94,7 @@ namespace din {
while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) { while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) {
PathName curr_subdir(it_entry->path); PathName curr_subdir(it_entry->path);
if (it_entry->is_dir) { if (it_entry->is_dir) {
hash_dir(it_entry, parEnd, curr_subdir, parDone); hash_dir(it_entry, parEnd, curr_subdir, parItemDoneCallback);
std::string relpath = make_relative_path(parCurrDir, curr_subdir).path(); std::string relpath = make_relative_path(parCurrDir, curr_subdir).path();
const auto old_size = dir_blob.size(); const auto old_size = dir_blob.size();
@ -133,7 +137,7 @@ namespace din {
std::cout << "Hashing file " << it_entry->path << "..."; std::cout << "Hashing file " << it_entry->path << "...";
#endif #endif
tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size); tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size);
++parDone; parItemDoneCallback();
#if defined(INDEXER_VERBOSE) #if defined(INDEXER_VERBOSE)
std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n'; std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n';
#endif #endif
@ -144,7 +148,7 @@ namespace din {
#if defined(INDEXER_VERBOSE) #if defined(INDEXER_VERBOSE)
std::cout << "Final hash for dir " << parCurrDir << " is " << tiger_to_string(curr_entry_it->hash) << '\n'; std::cout << "Final hash for dir " << parCurrDir << " is " << tiger_to_string(curr_entry_it->hash) << '\n';
#endif #endif
++parDone; //parItemDoneCallback();
} }
} //unnamed namespace } //unnamed namespace
@ -153,7 +157,10 @@ namespace din {
DinDBSettings db_settings; DinDBSettings db_settings;
PathList paths; PathList paths;
#if defined(WITH_PROGRESS_FEEDBACK)
std::atomic<std::size_t> done_count; std::atomic<std::size_t> done_count;
std::condition_variable step_notify;
#endif
std::size_t file_count; std::size_t file_count;
}; };
@ -189,7 +196,9 @@ namespace din {
//assert(not (FileEntry("/a/b/1.txt", 3, false, false) < FileEntry("/a/b/c/f.txt", 4, true, false))); //assert(not (FileEntry("/a/b/1.txt", 3, false, false) < FileEntry("/a/b/c/f.txt", 4, true, false)));
//assert(not (FileEntry("/a/b/c/file.c", 4, false, false) < FileEntry("/a/b/c", 3, true, false))); //assert(not (FileEntry("/a/b/c/file.c", 4, false, false) < FileEntry("/a/b/c", 3, true, false)));
#endif #endif
#if defined(WITH_PROGRESS_FEEDBACK)
m_local_data->done_count = 0; m_local_data->done_count = 0;
#endif
m_local_data->file_count = 0; m_local_data->file_count = 0;
m_local_data->db_settings = parDBSettings; m_local_data->db_settings = parDBSettings;
} }
@ -201,9 +210,11 @@ namespace din {
return m_local_data->file_count; return m_local_data->file_count;
} }
#if defined(WITH_PROGRESS_FEEDBACK)
std::size_t Indexer::processed_items() const { std::size_t Indexer::processed_items() const {
return m_local_data->done_count; return m_local_data->done_count;
} }
#endif
void Indexer::calculate_hash() { void Indexer::calculate_hash() {
PathName base_path(m_local_data->paths.front().path); PathName base_path(m_local_data->paths.front().path);
@ -223,10 +234,27 @@ namespace din {
std::cout << "-----------------------------------------------------\n"; std::cout << "-----------------------------------------------------\n";
#endif #endif
#if defined(WITH_PROGRESS_FEEDBACK)
m_local_data->done_count = 0; m_local_data->done_count = 0;
hash_dir(m_local_data->paths.begin(), m_local_data->paths.end(), base_path, m_local_data->done_count); hash_dir(
m_local_data->paths.begin(),
m_local_data->paths.end(),
base_path,
[=]() {
++m_local_data->done_count;
m_local_data->step_notify.notify_all();
}
);
assert(m_local_data->done_count == m_local_data->paths.size()); assert(m_local_data->done_count == m_local_data->paths.size());
#else
hash_dir(
m_local_data->paths.begin(),
m_local_data->paths.end(),
base_path,
[]() {}
);
#endif
#if defined(INDEXER_VERBOSE) #if defined(INDEXER_VERBOSE)
for (const auto& itm : m_local_data->paths) { for (const auto& itm : m_local_data->paths) {
@ -236,7 +264,9 @@ namespace din {
} }
void Indexer::add_to_db (const std::string& parSetName, char parType) const { void Indexer::add_to_db (const std::string& parSetName, char parType) const {
#if defined(WITH_PROGRESS_FEEDBACK)
assert(m_local_data->done_count == m_local_data->paths.size()); assert(m_local_data->done_count == m_local_data->paths.size());
#endif
PathName base_path(m_local_data->paths.front().path); PathName base_path(m_local_data->paths.front().path);
std::vector<FileRecordData> data; std::vector<FileRecordData> data;
data.reserve(m_local_data->paths.size()); data.reserve(m_local_data->paths.size());
@ -287,4 +317,10 @@ namespace din {
bool Indexer::empty() const { bool Indexer::empty() const {
return m_local_data->paths.size() < 2; return m_local_data->paths.size() < 2;
} }
#if defined(WITH_PROGRESS_FEEDBACK)
std::condition_variable& Indexer::step_notify() {
	//Hand out the condition variable stored in the private data block so
	//that callers can wait on it.
	auto& notifier = m_local_data->step_notify;
	return notifier;
}
#endif
} //namespace din } //namespace din

View file

@ -24,6 +24,12 @@
# define INDEXER_VERBOSE # define INDEXER_VERBOSE
#endif #endif
#if defined(WITH_PROGRESS_FEEDBACK)
//std::condition_variable is named in the Indexer interface below. Standard
//library types must not be forward-declared by user code (adding declarations
//to namespace std is undefined behaviour), so pull in the real header instead.
# include <condition_variable>
#endif
namespace din { namespace din {
struct DinDBSettings; struct DinDBSettings;
@ -40,7 +46,10 @@ namespace din {
#endif #endif
std::size_t total_items ( void ) const; std::size_t total_items ( void ) const;
#if defined(WITH_PROGRESS_FEEDBACK)
std::size_t processed_items ( void ) const; std::size_t processed_items ( void ) const;
std::condition_variable& step_notify ( void );
#endif
void calculate_hash ( void ); void calculate_hash ( void );
void add_to_db ( const std::string& parSetName, char parType ) const; void add_to_db ( const std::string& parSetName, char parType ) const;
bool empty ( void ) const; bool empty ( void ) const;

View file

@ -17,11 +17,20 @@
#include <iostream> #include <iostream>
#include <ciso646> #include <ciso646>
#if defined(WITH_PROGRESS_FEEDBACK)
# include <chrono>
# include <condition_variable>
# include <mutex>
# include <thread>
#endif
#include "filesearcher.hpp" #include "filesearcher.hpp"
#include "indexer.hpp" #include "indexer.hpp"
#include "settings.hpp" #include "settings.hpp"
#include "commandline.hpp" #include "commandline.hpp"
namespace {
//Runs parIndexer.calculate_hash(); when parShowProgress is true and
//WITH_PROGRESS_FEEDBACK is defined, progress is printed to stdout while the
//hashing runs. Defined after main().
void run_hash_calculation ( din::Indexer& parIndexer, bool parShowProgress );
} //unnamed namespace
int main (int parArgc, char* parArgv[]) { int main (int parArgc, char* parArgv[]) {
using std::placeholders::_1; using std::placeholders::_1;
using std::placeholders::_2; using std::placeholders::_2;
@ -62,8 +71,43 @@ int main (int parArgc, char* parArgv[]) {
return 1; return 1;
} }
else { else {
indexer.calculate_hash(); #if defined(WITH_PROGRESS_FEEDBACK)
const bool verbose = (0 == vm.count("quiet"));
#else
const bool verbose = false;
#endif
run_hash_calculation(indexer, verbose);
indexer.add_to_db(vm["setname"].as<std::string>(), vm["type"].as<char>()); indexer.add_to_db(vm["setname"].as<std::string>(), vm["type"].as<char>());
} }
return 0; return 0;
} }
namespace {
	//Runs the hash calculation on parIndexer, optionally showing progress.
	//
	//parIndexer: the indexer whose calculate_hash() is to be executed.
	//parShowProgress: when true (and WITH_PROGRESS_FEEDBACK is defined) the
	//  hashing runs on a worker thread while this thread prints
	//  "Processed X of Y" to stdout; otherwise calculate_hash() is called
	//  directly on the calling thread and nothing is printed.
	//
	//The function returns only after calculate_hash() has finished.
	void run_hash_calculation (din::Indexer& parIndexer, bool parShowProgress) {
#if !defined(WITH_PROGRESS_FEEDBACK)
		parShowProgress = false;
#endif
		if (not parShowProgress) {
			parIndexer.calculate_hash();
		}
#if defined(WITH_PROGRESS_FEEDBACK)
		else {
			std::cout << "Fetching items list...\n";
			const auto total_items = parIndexer.total_items();
			std::thread hash_thread(&din::Indexer::calculate_hash, &parIndexer);
			std::mutex progress_print;
			while (parIndexer.processed_items() != total_items) {
				std::unique_lock<std::mutex> lk(progress_print);
				//The worker bumps its counter and notifies without ever
				//locking progress_print, so a notification can fire between
				//the check above and the wait below. An unbounded wait()
				//would then sleep forever once the last item is done; a
				//bounded wait guarantees the counter gets re-checked.
				parIndexer.step_notify().wait_for(lk, std::chrono::milliseconds(200));
				std::cout << "Processed " << parIndexer.processed_items() << " of " << total_items << '\r';
				std::cout.flush();
			}
			hash_thread.join();
			//Terminate the '\r'-rewritten progress line, if one was printed.
			if (parIndexer.processed_items() > 0) {
				std::cout << '\n';
			}
		}
#endif
	}
} //unnamed namespace