From a00d30b0ee359c00906416dac40edcdaff30c27f Mon Sep 17 00:00:00 2001 From: King_DuckZ Date: Tue, 10 Nov 2015 17:48:22 +0000 Subject: [PATCH] Calculate hash for all entries. --- src/indexer.cpp | 143 ++++++++++++++++++++++++++++++++++++++------- src/pathname.cpp | 103 ++++++++++++++++++++++++-------- src/pathname.hpp | 6 ++ src/stringpool.hpp | 2 + src/stringpool.inl | 25 ++++++++ src/tiger.c | 8 +-- src/tiger.cpp | 30 +++++++++- src/tiger.hpp | 3 + 8 files changed, 269 insertions(+), 51 deletions(-) diff --git a/src/indexer.cpp b/src/indexer.cpp index 94e74a7..ddf9634 100644 --- a/src/indexer.cpp +++ b/src/indexer.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #if !defined(NDEBUG) # include @@ -47,6 +48,7 @@ namespace din { FileEntry& operator= ( const FileEntry& ) = delete; FileEntry& operator= ( FileEntry&& ) = default; bool operator< ( const FileEntry& parOther ) const; + bool operator== ( const FileEntry& ) const = delete; std::string path; HashType hash; @@ -55,23 +57,116 @@ namespace din { bool is_symlink; }; + namespace { + void hash_dir (std::vector::iterator parEntry, std::vector::iterator parEnd, const PathName& parCurrDir) { + FileEntry& curr_entry = *parEntry; + + //Build a blob with the hashes and filenames of every directory that + //is a direct child of current entry + { + std::vector dir_blob; + auto it_entry = parEntry; + + while ( + it_entry != parEnd + and (not it_entry->is_dir or (it_entry->level <= curr_entry.level + and parCurrDir != PathName(it_entry->path).pop_right())) + ) { + ++it_entry; + } + +#if !defined(NDEBUG) + std::cout << "Making initial hash for " << parCurrDir << "...\n"; +#endif + while (parEnd != it_entry and it_entry->is_dir and it_entry->level == parEntry->level + 1) { + PathName curr_subdir(it_entry->path); + hash_dir(it_entry, parEnd, curr_subdir); + + std::string relpath = make_relative_path(parCurrDir, curr_subdir).path(); + const auto old_size = dir_blob.size(); + dir_blob.resize(old_size + sizeof(HashType) + relpath.size()); + std::copy(it_entry->hash.byte_data, it_entry->hash.byte_data + sizeof(HashType), dir_blob.begin() + old_size); + std::copy(relpath.begin(), relpath.end(), dir_blob.begin() + old_size + sizeof(HashType)); + ++it_entry; + } + + tiger_data(dir_blob, curr_entry.hash); +#if !defined(NDEBUG) + std::cout << "Got intermediate hash for dir " << parCurrDir << ": " << tiger_to_string(curr_entry.hash) << '\n'; +#endif + } + + //Now with the initial hash ready, let's start hashing files, if any + { + auto it_entry = parEntry; + while ( + it_entry != parEnd + and (it_entry->is_dir + or it_entry->level != parEntry->level + 1 + or PathName(it_entry->path).pop_right() != parCurrDir + ) + ) { + ++it_entry; + } + + while (it_entry != parEnd and not it_entry->is_dir and it_entry->level == parEntry->level + 1 and PathName(it_entry->path).pop_right() == parCurrDir) { +#if !defined(NDEBUG) + std::cout << "Hashing file " << it_entry->path << "..."; +#endif + tiger_file(it_entry->path, it_entry->hash, parEntry->hash); +#if !defined(NDEBUG) + std::cout << ' ' << tiger_to_string(it_entry->hash) << '\n'; +#endif + ++it_entry; + } + } + +#if !defined(NDEBUG) + std::cout << "Final hash for dir " << parCurrDir << " is " << tiger_to_string(parEntry->hash) << '\n'; +#endif + } + } //unnamed namespace + struct Indexer::LocalData { typedef std::vector PathList; PathList paths; - std::string base_path; std::atomic done_count; std::size_t file_count; }; bool FileEntry::operator< (const FileEntry& parOther) const { - return (this->level < parOther.level) - or (this->level == parOther.level and this->path < parOther.path); + const FileEntry& o = parOther; + return + (level < o.level) + or (level == o.level and is_dir and not o.is_dir) + or (level == o.level and is_dir == o.is_dir and path < o.path) + + //sort by directory - parent first, children later + //(level == o.level and is_dir and not o.is_dir) + //or (level == o.level and is_dir == o.is_dir and path < o.path) + //or (level > o.level + 1) + //or (level + 1 == o.level and is_dir and not o.is_dir and path < o.path) + //or (level + 1 == o.level and is_dir and not o.is_dir and path == PathName(o.path).dirname()) + //or (level == o.level + 1 and not (o.is_dir and not is_dir and o.path == PathName(path).dirname())) + ; } Indexer::Indexer() : m_local_data(new LocalData) { +#if !defined(NDEBUG) + //assert(FileEntry("/a/b/c", 3, true, false) < FileEntry("/a/b", 2, true, false)); + //assert(FileEntry("/a/b/c", 3, true, false) < FileEntry("/a/b/c/file.txt", 4, false, false)); + //assert(FileEntry("/a/b/c", 3, true, false) < FileEntry("/a/b/c/file.c", 4, false, false)); + //assert(FileEntry("/a/b/c/d", 4, true, false) < FileEntry("/a/b", 2, true, false)); + //assert(FileEntry("/a/b/c/d", 4, true, false) < FileEntry("/a/b/c", 3, true, false)); + //assert(FileEntry("/a/b/c/1.txt", 4, true, false) < FileEntry("/a/b/c/2.txt", 4, true, false)); + //assert(not (FileEntry("/a/b/file.txt", 3, false, false) < FileEntry("/a/b", 2, true, false))); + //assert(not (FileEntry("/a", 1, true, false) < FileEntry("/a/b", 2, true, false))); + //assert(not (FileEntry("/a/b/1.txt", 3, false, false) < FileEntry("/a/b/c/f.txt", 4, true, false))); + //assert(not (FileEntry("/a/b/c/file.c", 4, false, false) < FileEntry("/a/b/c", 3, true, false))); +#endif m_local_data->done_count = 0; m_local_data->file_count = 0; } @@ -88,35 +183,43 @@ namespace din { } void Indexer::calculate_hash() { +#if !defined(NDEBUG) std::sort(m_local_data->paths.begin(), m_local_data->paths.end()); + PathName base_path(m_local_data->paths.front().path); + for (auto& itm : m_local_data->paths) { + itm.hash.part_a = 1; + itm.hash.part_b = 1; + itm.hash.part_c = 1; - HashType dir_hash; - tiger_init_hash(dir_hash); - for (auto& cur_itm : m_local_data->paths) { - if (not cur_itm.is_dir) { - std::cout << "Hashing " << cur_itm.path << "..."; - tiger_init_hash(cur_itm.hash); - tiger_file(cur_itm.path, cur_itm.hash, dir_hash); - std::cout << " --> " << tiger_to_string(cur_itm.hash) << '\n'; - } + if (itm.is_dir) + std::cout << "(D) "; + else + std::cout << "(F) "; + std::cout << itm.path << " (" << itm.level << ")\n"; } + std::cout << "-----------------------------------------------------\n"; +#endif + + hash_dir(m_local_data->paths.begin(), m_local_data->paths.end(), base_path); + +#if !defined(NDEBUG) + for (const auto& itm : m_local_data->paths) { + assert(not (1 == itm.hash.part_a and 1 == itm.hash.part_b and 1 == itm.hash.part_c)); + } +#endif } bool Indexer::add_path (const char* parPath, int parLevel, bool parIsDir, bool parIsSymLink) { - if (parLevel > 0) { - m_local_data->paths.push_back(FileEntry(parPath, parLevel, parIsDir, parIsSymLink)); - if (not parIsDir) { - ++m_local_data->file_count; - } - } else { - m_local_data->base_path = parPath; + m_local_data->paths.push_back(FileEntry(parPath, parLevel, parIsDir, parIsSymLink)); + if (not parIsDir) { + ++m_local_data->file_count; } return true; } #if !defined(NDEBUG) void Indexer::dump() const { - PathName base_path(m_local_data->base_path); + PathName base_path(m_local_data->paths.front().path); std::cout << "---------------- FILE LIST ----------------\n"; for (const auto& cur_itm : m_local_data->paths) { diff --git a/src/pathname.cpp b/src/pathname.cpp index e6fcf7d..4920039 100644 --- a/src/pathname.cpp +++ b/src/pathname.cpp @@ -19,11 +19,14 @@ #include #include #include +#include namespace din { const std::string PathName::m_empty_str(""); namespace { + std::string get_joint_atoms ( const StringPool& parPool, bool parAbs, std::size_t parSkipRight=0 ); + bool ptr_between (const char* parPtr, const char* parBeg, const char* parEnd) { std::less less; std::less_equal lesseq; @@ -57,6 +60,36 @@ namespace din { parOut->push_back(parPath.substr(from - beg, next - from)); } } + + std::string get_joint_atoms (const StringPool& parPool, bool parAbs, std::size_t parSkipRight) { + const auto orig_atom_count = parPool.size(); + const auto atom_count = (parSkipRight >= orig_atom_count ? 0 : orig_atom_count - parSkipRight); + if (not atom_count) { + if (parPool.empty() and parAbs) { + return std::string("/"); + } + else { + return std::string(""); + } + } + + std::size_t reserve = (parAbs ? 1 : 0); + for (std::size_t z = 0; z < atom_count; ++z) { + reserve += parPool[z].size(); + } + reserve += atom_count - 1; + + std::string out; + out.reserve(reserve); + const char* slash = (parAbs ? "/" : ""); + for (std::size_t z = 0; z < atom_count; ++z) { + out += slash; + const auto& curr_itm = parPool[z]; + out.insert(out.end(), curr_itm.begin(), curr_itm.end()); + slash = "/"; + } + return std::move(out); + } } //unnamed namespace PathName::PathName (boost::string_ref parPath) { @@ -80,30 +113,7 @@ namespace din { } std::string PathName::path() const { - if (m_pool.empty()) { - if (m_absolute) { - return std::string("/"); - } - else { - return std::string(""); - } - } - - std::size_t reserve = (m_absolute ? 1 : 0); - for (const auto& itm : m_pool) { - reserve += itm.size(); - } - reserve += m_pool.size() - 1; - - std::string out; - out.reserve(reserve); - const char* slash = (m_absolute ? "/" : ""); - for (const auto& itm : m_pool) { - out += slash; - out.insert(out.end(), itm.begin(), itm.end()); - slash = "/"; - } - return std::move(out); + return get_joint_atoms(m_pool, m_absolute); } void PathName::join (const PathName& parOther) { @@ -162,4 +172,49 @@ namespace din { const std::string* PathName::get_stringref_source (std::size_t parIndex) const { return m_pool.get_stringref_source(parIndex); } + + std::string PathName::dirname() const { + if (this->atom_count() == 0) + return std::string(); + + return get_joint_atoms(m_pool, m_absolute, 1); + } + + std::ostream& operator<< (std::ostream& parStream, const PathName& parPath) { + parStream << parPath.path(); + return parStream; + } + + PathName& PathName::pop_right() { + m_pool.pop(); + return *this; + } + + bool PathName::operator!= (const PathName& parOther) const { + const auto count = atom_count(); + if (count != parOther.atom_count()) { + return true; + } + + for (std::size_t z = 0; z < count; ++z) { + if ((*this)[z] != parOther[z]) { + return true; + } + } + return false; + } + + bool PathName::operator== (const PathName& parOther) const { + const auto count = atom_count(); + if (count != parOther.atom_count()) { + return false; + } + + for (std::size_t z = 0; z < count; ++z) { + if ((*this)[z] != parOther[z]) { + return false; + } + } + return true; + } } //namespace din diff --git a/src/pathname.hpp b/src/pathname.hpp index 08dd265..83f141f 100644 --- a/src/pathname.hpp +++ b/src/pathname.hpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace din { class PathName { @@ -41,6 +42,10 @@ namespace din { void join ( const char* parOther ); void join ( boost::string_ref parOther, const std::string* parSource ); const std::string* get_stringref_source ( std::size_t parIndex ) const; + std::string dirname ( void ) const; + PathName& pop_right ( void ); + bool operator!= ( const PathName& parOther ) const; + bool operator== ( const PathName& parOther ) const; private: static const std::string m_empty_str; @@ -51,6 +56,7 @@ namespace din { }; PathName make_relative_path ( const PathName& parBasePath, const PathName& parOtherPath ); + std::ostream& operator<< ( std::ostream& parStream, const PathName& parPath ); } //namespace din #endif diff --git a/src/stringpool.hpp b/src/stringpool.hpp index bd5820a..aba89fb 100644 --- a/src/stringpool.hpp +++ b/src/stringpool.hpp @@ -56,6 +56,8 @@ namespace din { const_iterator begin ( void ) const; const_iterator end ( void ) const; const string_type* get_stringref_source ( std::size_t parIndex ) const; + const stringref_type& operator[] ( std::size_t parIndex ) const; + void pop ( void ); private: PoolType m_pool; diff --git a/src/stringpool.inl b/src/stringpool.inl index 4ccefcb..085c20b 100644 --- a/src/stringpool.inl +++ b/src/stringpool.inl @@ -112,4 +112,29 @@ namespace din { auto StringPool::get_stringref_source (std::size_t parIndex) const -> const string_type* { return m_strings[parIndex].second; } + + template + auto StringPool::operator[] (std::size_t parIndex) const -> const stringref_type& { + return m_strings[parIndex].first; + } + + template + void StringPool::pop() { + if (m_strings.empty()) { + return; + } + + for (auto z = m_pool.size(); z > 0; --z) { + auto& pool_itm = m_pool[z - 1]; + if (&pool_itm.first == m_strings.back().second) { + m_strings.resize(m_strings.size() - 1); + --pool_itm.second; + if (0 == pool_itm.second) { + m_pool.erase(m_pool.begin() + (z - 1)); + } + break; + } + } + return; + } } //namespace din diff --git a/src/tiger.c b/src/tiger.c index 0e8fc5e..1c4e866 100644 --- a/src/tiger.c +++ b/src/tiger.c @@ -808,7 +808,7 @@ void tiger_sse2_chunk(const char *str1, const char *str2, t_word length, t_res r #endif } } -void tiger_sse2_last_chunk (const char *str1, const char *str2, t_word length, t_word reallength, t_res res1, t_res res2, char pad) +void tiger_sse2_last_chunk (const char *str1, const char *str2, t_word length, t_word reallength1, t_word reallength2, t_res res1, t_res res2, char pad) { t_word i; t_block tmp1; @@ -828,8 +828,8 @@ void tiger_sse2_last_chunk (const char *str1, const char *str2, t_word length, t } memset(uc(tmp1)+i,0,(size_t)(56-i)); memset(uc(tmp2)+i,0,(size_t)(56-i)); - tmp1[7]=reallength<<(t_word)3; - tmp2[7]=reallength<<(t_word)3; + tmp1[7]=reallength1<<(t_word)3; + tmp2[7]=reallength2<<(t_word)3; tiger_block_sse2(tmp1, tmp2, res1, res2); } @@ -844,7 +844,7 @@ void tiger_sse2(const char *str1, const char *str2, t_word length, t_res res1, t res2[2]=0xF096A5B4C3B2E187ULL; tiger_sse2_chunk(str1, str2, aligned_length, res1, res2); - tiger_sse2_last_chunk(str1 + aligned_length, str2 + aligned_length, length - aligned_length, length, res1, res2, pad); + tiger_sse2_last_chunk(str1 + aligned_length, str2 + aligned_length, length - aligned_length, length, length, res1, res2, pad); } #endif diff --git a/src/tiger.cpp b/src/tiger.cpp index ee46573..f822f81 100644 --- a/src/tiger.cpp +++ b/src/tiger.cpp @@ -21,11 +21,13 @@ #include #include #include +#include #include #if defined(__SSE2__) extern "C" void tiger_sse2_chunk ( const char* parStr1, const char* parStr2, uint64_t parLength, uint64_t parRes1[3], uint64_t parRes2[3] ); -extern "C" void tiger_sse2_last_chunk ( const char* parStr1, const char* parStr2, uint64_t parLength, uint64_t parRealLength, uint64_t parRes1[3], uint64_t parRes2[3], char pad ); +extern "C" void tiger_sse2_last_chunk ( const char* parStr1, const char* parStr2, uint64_t parLength, uint64_t parRealLength1, uint64_t parRealLength2, uint64_t parRes1[3], uint64_t parRes2[3], char parPadding ); +extern "C" void tiger ( const char* parStr, uint64_t parLength, uint64_t parHash[3], char parPadding ); #else # error "Not implemented without SSE2" @@ -51,6 +53,7 @@ namespace din { } void tiger_file (const std::string& parPath, TigerHash& parHashFile, TigerHash& parHashDir) { + typedef decltype(std::declval().tellg()) FileSizeType; tiger_init_hash(parHashFile); std::ifstream src(parPath, std::ios::binary); @@ -58,11 +61,22 @@ namespace din { const auto file_size = src.tellg(); src.seekg(0, std::ios_base::beg); - const uint32_t buffsize = static_cast(std::min(file_size, g_buff_size)); + const FileSizeType hash_size = (sizeof(TigerHash) + 63) & -64; + const uint32_t buffsize = static_cast(std::max(hash_size, std::min(file_size, g_buff_size))); std::unique_ptr buff(new char[63 + buffsize]); char* const buff_ptr = reinterpret_cast(reinterpret_cast(buff.get() + 63) & (-64)); assert(buff_ptr >= buff.get() and buff_ptr + buffsize <= buff.get() + 63 + buffsize); + //Use the initial value of the dir's hash as if it was part of the data to hash and start + //by processing that value. Hash is reset to the initial value before the call to tiger. + { + std::copy(parHashDir.byte_data, parHashDir.byte_data + sizeof(parHashDir), buff_ptr); + std::fill(buff_ptr + sizeof(parHashDir), buff_ptr + hash_size, 0); + TigerHash dummy = {}; + tiger_init_hash(parHashDir); + tiger_sse2_chunk(buff_ptr, buff_ptr, hash_size, dummy.data, parHashDir.data); + } + auto remaining = file_size; while (remaining > buffsize) { assert(buffsize >= sizeof(uint64_t) * 3); @@ -80,7 +94,9 @@ namespace din { tiger_sse2_chunk(buff_ptr, buff_ptr, aligned_size, parHashFile.data, parHashDir.data); } - tiger_sse2_last_chunk(buff_ptr + aligned_size, buff_ptr + aligned_size, remaining - aligned_size, file_size, parHashFile.data, parHashDir.data, g_tiger_padding); + //Remember to pass the augmented data size for the second reallength value: we passed the initial + //dir's hash value (64 bytes) as if they were part of the data. + tiger_sse2_last_chunk(buff_ptr + aligned_size, buff_ptr + aligned_size, remaining - aligned_size, file_size, file_size + hash_size, parHashFile.data, parHashDir.data, g_tiger_padding); } } @@ -89,4 +105,12 @@ namespace din { oss << std::hex << swap_long(parHash.part_a) << swap_long(parHash.part_b) << swap_long(parHash.part_c); return oss.str(); } + + void tiger_data (const std::string& parData, TigerHash& parHash) { + tiger (parData.data(), parData.size(), parHash.data, g_tiger_padding); + } + + void tiger_data (const std::vector& parData, TigerHash& parHash) { + tiger (parData.data(), parData.size(), parHash.data, g_tiger_padding); + } } //namespace din diff --git a/src/tiger.hpp b/src/tiger.hpp index 9bbb9d8..6b3d3dc 100644 --- a/src/tiger.hpp +++ b/src/tiger.hpp @@ -20,6 +20,7 @@ #include #include +#include namespace din { struct TigerHash { @@ -41,6 +42,8 @@ namespace din { void tiger_file ( const std::string& parPath, TigerHash& parHashFile, TigerHash& parHashDir ); void tiger_init_hash ( TigerHash& parHash ); std::string tiger_to_string ( const TigerHash& parHash ); + void tiger_data ( const std::string& parData, TigerHash& parHash ); + void tiger_data ( const std::vector& parData, TigerHash& parHash ); } //namespace din #endif