mirror of
https://github.com/KingDuckZ/dindexer.git
synced 2024-11-25 00:53:43 +00:00
Add optional verbose mode that shows the number of hashed files.
This commit is contained in:
parent
a82ab4a4ed
commit
f7441292bc
6 changed files with 110 additions and 7 deletions
|
@ -47,3 +47,7 @@ target_link_libraries(${PROJECT_NAME}
|
||||||
${YAMLCPP_LIBRARY}
|
${YAMLCPP_LIBRARY}
|
||||||
${Boost_LIBRARIES}
|
${Boost_LIBRARIES}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
target_compile_definitions(${PROJECT_NAME}
|
||||||
|
PUBLIC WITH_PROGRESS_FEEDBACK
|
||||||
|
)
|
||||||
|
|
7
find_duplicate_files.sql
Normal file
7
find_duplicate_files.sql
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
-- Earlier drafts, kept for reference (the second one was never completed):
--select hash, group_id, count(hash) as hash_count from files group by hash, group_id having count(*) > 1 order by hash;
--select hash, group_id from files group by hash, group_id having count(*) > 1 and count(order by hash;

-- Lists the hashes that occur under more than one distinct group_id.
-- One row is returned per (hash, group_id) pair; ct is the total number of
-- rows in "files" carrying that hash. Rows are ordered by hash.
SELECT files.hash, group_id, t.ct
FROM files
INNER JOIN (
	SELECT hash, COUNT(*) AS ct
	FROM files
	GROUP BY hash
	HAVING COUNT(DISTINCT group_id) > 1
) t ON t.hash = files.hash
GROUP BY files.hash, group_id, t.ct
ORDER BY files.hash;
|
|
@ -60,6 +60,9 @@ namespace din {
|
||||||
("help,h", "Produces this help message")
|
("help,h", "Produces this help message")
|
||||||
("version", "Prints the program's version and quits")
|
("version", "Prints the program's version and quits")
|
||||||
//("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout")
|
//("dump-raw,D", po::value<std::string>(), "Saves the retrieved html to the named file; use - for stdout")
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
|
("quiet,q", "Hide progress messages and print nothing at all")
|
||||||
|
#endif
|
||||||
;
|
;
|
||||||
po::options_description set_options("Set options");
|
po::options_description set_options("Set options");
|
||||||
set_options.add_options()
|
set_options.add_options()
|
||||||
|
|
|
@ -21,9 +21,13 @@
|
||||||
#include "dbbackend.hpp"
|
#include "dbbackend.hpp"
|
||||||
#include "settings.hpp"
|
#include "settings.hpp"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <functional>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <atomic>
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
|
# include <atomic>
|
||||||
|
# include <condition_variable>
|
||||||
|
#endif
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <ciso646>
|
#include <ciso646>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
@ -61,7 +65,7 @@ namespace din {
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
void hash_dir (std::vector<FileEntry>::iterator parEntry, std::vector<FileEntry>::iterator parEnd, const PathName& parCurrDir, std::atomic<std::size_t>& parDone) {
|
void hash_dir (std::vector<FileEntry>::iterator parEntry, std::vector<FileEntry>::iterator parEnd, const PathName& parCurrDir, std::function<void()> parItemDoneCallback) {
|
||||||
assert(parEntry != parEnd);
|
assert(parEntry != parEnd);
|
||||||
assert(parEntry->is_dir);
|
assert(parEntry->is_dir);
|
||||||
FileEntry& curr_entry = *parEntry;
|
FileEntry& curr_entry = *parEntry;
|
||||||
|
@ -90,7 +94,7 @@ namespace din {
|
||||||
while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) {
|
while (parEnd != it_entry and it_entry->level == curr_entry_it->level + 1 and parCurrDir == PathName(it_entry->path).pop_right()) {
|
||||||
PathName curr_subdir(it_entry->path);
|
PathName curr_subdir(it_entry->path);
|
||||||
if (it_entry->is_dir) {
|
if (it_entry->is_dir) {
|
||||||
hash_dir(it_entry, parEnd, curr_subdir, parDone);
|
hash_dir(it_entry, parEnd, curr_subdir, parItemDoneCallback);
|
||||||
|
|
||||||
std::string relpath = make_relative_path(parCurrDir, curr_subdir).path();
|
std::string relpath = make_relative_path(parCurrDir, curr_subdir).path();
|
||||||
const auto old_size = dir_blob.size();
|
const auto old_size = dir_blob.size();
|
||||||
|
@ -133,7 +137,7 @@ namespace din {
|
||||||
std::cout << "Hashing file " << it_entry->path << "...";
|
std::cout << "Hashing file " << it_entry->path << "...";
|
||||||
#endif
|
#endif
|
||||||
tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size);
|
tiger_file(it_entry->path, it_entry->hash, curr_entry_it->hash, it_entry->file_size);
|
||||||
++parDone;
|
parItemDoneCallback();
|
||||||
#if defined(INDEXER_VERBOSE)
|
#if defined(INDEXER_VERBOSE)
|
||||||
std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n';
|
std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n';
|
||||||
#endif
|
#endif
|
||||||
|
@ -144,7 +148,7 @@ namespace din {
|
||||||
#if defined(INDEXER_VERBOSE)
|
#if defined(INDEXER_VERBOSE)
|
||||||
std::cout << "Final hash for dir " << parCurrDir << " is " << tiger_to_string(curr_entry_it->hash) << '\n';
|
std::cout << "Final hash for dir " << parCurrDir << " is " << tiger_to_string(curr_entry_it->hash) << '\n';
|
||||||
#endif
|
#endif
|
||||||
++parDone;
|
//parItemDoneCallback();
|
||||||
}
|
}
|
||||||
} //unnamed namespace
|
} //unnamed namespace
|
||||||
|
|
||||||
|
@ -153,7 +157,10 @@ namespace din {
|
||||||
|
|
||||||
DinDBSettings db_settings;
|
DinDBSettings db_settings;
|
||||||
PathList paths;
|
PathList paths;
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
std::atomic<std::size_t> done_count;
|
std::atomic<std::size_t> done_count;
|
||||||
|
std::condition_variable step_notify;
|
||||||
|
#endif
|
||||||
std::size_t file_count;
|
std::size_t file_count;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -189,7 +196,9 @@ namespace din {
|
||||||
//assert(not (FileEntry("/a/b/1.txt", 3, false, false) < FileEntry("/a/b/c/f.txt", 4, true, false)));
|
//assert(not (FileEntry("/a/b/1.txt", 3, false, false) < FileEntry("/a/b/c/f.txt", 4, true, false)));
|
||||||
//assert(not (FileEntry("/a/b/c/file.c", 4, false, false) < FileEntry("/a/b/c", 3, true, false)));
|
//assert(not (FileEntry("/a/b/c/file.c", 4, false, false) < FileEntry("/a/b/c", 3, true, false)));
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
m_local_data->done_count = 0;
|
m_local_data->done_count = 0;
|
||||||
|
#endif
|
||||||
m_local_data->file_count = 0;
|
m_local_data->file_count = 0;
|
||||||
m_local_data->db_settings = parDBSettings;
|
m_local_data->db_settings = parDBSettings;
|
||||||
}
|
}
|
||||||
|
@ -201,9 +210,11 @@ namespace din {
|
||||||
return m_local_data->file_count;
|
return m_local_data->file_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
std::size_t Indexer::processed_items() const {
|
std::size_t Indexer::processed_items() const {
|
||||||
return m_local_data->done_count;
|
return m_local_data->done_count;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void Indexer::calculate_hash() {
|
void Indexer::calculate_hash() {
|
||||||
PathName base_path(m_local_data->paths.front().path);
|
PathName base_path(m_local_data->paths.front().path);
|
||||||
|
@ -223,10 +234,27 @@ namespace din {
|
||||||
std::cout << "-----------------------------------------------------\n";
|
std::cout << "-----------------------------------------------------\n";
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
m_local_data->done_count = 0;
|
m_local_data->done_count = 0;
|
||||||
hash_dir(m_local_data->paths.begin(), m_local_data->paths.end(), base_path, m_local_data->done_count);
|
hash_dir(
|
||||||
|
m_local_data->paths.begin(),
|
||||||
|
m_local_data->paths.end(),
|
||||||
|
base_path,
|
||||||
|
[=]() {
|
||||||
|
++m_local_data->done_count;
|
||||||
|
m_local_data->step_notify.notify_all();
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
assert(m_local_data->done_count == m_local_data->paths.size());
|
assert(m_local_data->done_count == m_local_data->paths.size());
|
||||||
|
#else
|
||||||
|
hash_dir(
|
||||||
|
m_local_data->paths.begin(),
|
||||||
|
m_local_data->paths.end(),
|
||||||
|
base_path,
|
||||||
|
[]() {}
|
||||||
|
);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(INDEXER_VERBOSE)
|
#if defined(INDEXER_VERBOSE)
|
||||||
for (const auto& itm : m_local_data->paths) {
|
for (const auto& itm : m_local_data->paths) {
|
||||||
|
@ -236,7 +264,9 @@ namespace din {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Indexer::add_to_db (const std::string& parSetName, char parType) const {
|
void Indexer::add_to_db (const std::string& parSetName, char parType) const {
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
assert(m_local_data->done_count == m_local_data->paths.size());
|
assert(m_local_data->done_count == m_local_data->paths.size());
|
||||||
|
#endif
|
||||||
PathName base_path(m_local_data->paths.front().path);
|
PathName base_path(m_local_data->paths.front().path);
|
||||||
std::vector<FileRecordData> data;
|
std::vector<FileRecordData> data;
|
||||||
data.reserve(m_local_data->paths.size());
|
data.reserve(m_local_data->paths.size());
|
||||||
|
@ -287,4 +317,10 @@ namespace din {
|
||||||
bool Indexer::empty() const {
|
bool Indexer::empty() const {
|
||||||
return m_local_data->paths.size() < 2;
|
return m_local_data->paths.size() < 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
|
std::condition_variable& Indexer::step_notify() {
|
||||||
|
return m_local_data->step_notify;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
} //namespace din
|
} //namespace din
|
||||||
|
|
|
@ -24,6 +24,12 @@
|
||||||
# define INDEXER_VERBOSE
|
# define INDEXER_VERBOSE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
//Indexer::step_notify() returns a std::condition_variable&. The class must
//not be forward-declared by hand: adding declarations to namespace std is
//undefined behaviour, and it fails to compile against standard libraries
//that define their types in an inline namespace. Include the real header.
# include <condition_variable>
#endif
|
||||||
|
|
||||||
namespace din {
|
namespace din {
|
||||||
struct DinDBSettings;
|
struct DinDBSettings;
|
||||||
|
|
||||||
|
@ -40,7 +46,10 @@ namespace din {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
std::size_t total_items ( void ) const;
|
std::size_t total_items ( void ) const;
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
std::size_t processed_items ( void ) const;
|
std::size_t processed_items ( void ) const;
|
||||||
|
std::condition_variable& step_notify ( void );
|
||||||
|
#endif
|
||||||
void calculate_hash ( void );
|
void calculate_hash ( void );
|
||||||
void add_to_db ( const std::string& parSetName, char parType ) const;
|
void add_to_db ( const std::string& parSetName, char parType ) const;
|
||||||
bool empty ( void ) const;
|
bool empty ( void ) const;
|
||||||
|
|
46
src/main.cpp
46
src/main.cpp
|
@ -17,11 +17,20 @@
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <ciso646>
|
#include <ciso646>
|
||||||
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
|
# include <thread>
|
||||||
|
# include <mutex>
|
||||||
|
# include <condition_variable>
|
||||||
|
#endif
|
||||||
#include "filesearcher.hpp"
|
#include "filesearcher.hpp"
|
||||||
#include "indexer.hpp"
|
#include "indexer.hpp"
|
||||||
#include "settings.hpp"
|
#include "settings.hpp"
|
||||||
#include "commandline.hpp"
|
#include "commandline.hpp"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
void run_hash_calculation ( din::Indexer& parIndexer, bool parShowProgress );
|
||||||
|
} //unnamed namespace
|
||||||
|
|
||||||
int main (int parArgc, char* parArgv[]) {
|
int main (int parArgc, char* parArgv[]) {
|
||||||
using std::placeholders::_1;
|
using std::placeholders::_1;
|
||||||
using std::placeholders::_2;
|
using std::placeholders::_2;
|
||||||
|
@ -62,8 +71,43 @@ int main (int parArgc, char* parArgv[]) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
indexer.calculate_hash();
|
#if defined(WITH_PROGRESS_FEEDBACK)
|
||||||
|
const bool verbose = (0 == vm.count("quiet"));
|
||||||
|
#else
|
||||||
|
const bool verbose = false;
|
||||||
|
#endif
|
||||||
|
run_hash_calculation(indexer, verbose);
|
||||||
indexer.add_to_db(vm["setname"].as<std::string>(), vm["type"].as<char>());
|
indexer.add_to_db(vm["setname"].as<std::string>(), vm["type"].as<char>());
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
///Runs parIndexer.calculate_hash(), optionally reporting progress on stdout.
///
///When parShowProgress is false (always the case if WITH_PROGRESS_FEEDBACK
///is not defined) the hashing runs synchronously on the calling thread.
///Otherwise it runs on a worker thread while the calling thread prints a
///"Processed X of Y" line each time it is woken up, and returns only after
///the worker has been joined.
void run_hash_calculation (din::Indexer& parIndexer, bool parShowProgress) {
#if !defined(WITH_PROGRESS_FEEDBACK)
	parShowProgress = false;
#endif
	if (not parShowProgress) {
		parIndexer.calculate_hash();
	}
#if defined(WITH_PROGRESS_FEEDBACK)
	else {
		std::cout << "Fetching items list...\n";
		const auto total_items = parIndexer.total_items();

		std::mutex progress_print;
		//Guarded by progress_print. The loop below ends on this flag rather
		//than on "processed == total", so it cannot hang if the two counters
		//never meet or if the last step notification is missed.
		bool hashing_done = false;

		std::thread hash_thread([&parIndexer, &progress_print, &hashing_done]() {
			parIndexer.calculate_hash();
			{
				//Set the flag under the same mutex the waiter holds while it
				//tests it, so the completion signal can never be lost.
				std::lock_guard<std::mutex> lk(progress_print);
				hashing_done = true;
			}
			parIndexer.step_notify().notify_all();
		});

		{
			std::unique_lock<std::mutex> lk(progress_print);
			while (not hashing_done) {
				//Woken by per-item notifications, by the completion
				//notification, or spuriously; every case just refreshes
				//the progress line.
				parIndexer.step_notify().wait(lk);
				std::cout << "Processed " << parIndexer.processed_items() << " of " << total_items << '\r';
				std::cout.flush();
			}
		}

		hash_thread.join();
		if (parIndexer.processed_items() > 0) {
			//Reprint the final count (the loop may not have run at all if
			//the worker finished first) and move off the progress line.
			std::cout << "Processed " << parIndexer.processed_items() << " of " << total_items << '\n';
		}
	}
#endif
}
|
||||||
|
} //unnamed namespace
|
||||||
|
|
Loading…
Reference in a new issue