176 lines
5.4 KiB
C++
176 lines
5.4 KiB
C++
// Copyright 2013 Google Inc. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
//
|
|
// Author: dsites@google.com (Dick Sites)
|
|
//
|
|
|
|
#ifndef UTIL_UTF8_OFFSETMAP_H_
|
|
#define UTIL_UTF8_OFFSETMAP_H_
|
|
|
|
#include <string> // for string
|
|
#include "integral_types.h" // for uint32
|
|
|
|
// ***************************** OffsetMap **************************
|
|
//
|
|
// An OffsetMap object is a container for a mapping from offsets in one text
|
|
// buffer A' to offsets in another text buffer A. It is most useful when A' is
|
|
// built from A via substitutions that occasionally do not preserve byte length.
|
|
//
|
|
// A series of operators are used to build the correspondence map, then
|
|
// calls can be made to map an offset in A' to an offset in A, or vice versa.
|
|
// The map starts with offset 0 in A corresponding to offset 0 in A'.
|
|
// The mapping is then built sequentially, adding on byte ranges that are
|
|
// identical in A and A', byte ranges that are inserted in A', and byte ranges
|
|
// that are deleted from A. All bytes beyond those specified when building the
|
|
// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
|
|
// end of the map.
|
|
//
|
|
// The internal data structure records positions at which bytes are added or
|
|
// deleted. Using the map is O(1) when increasing the A' or A offset
|
|
// monotonically, and O(n) when accessing random offsets, where n is the
|
|
// number of differences.
|
|
//
|
|
|
|
namespace CLD2 {
|
|
|
|
class OffsetMap {
|
|
public:
|
|
// Constructor, destructor
|
|
OffsetMap();
|
|
~OffsetMap();
|
|
|
|
// Clear the map
|
|
void Clear();
|
|
|
|
// Add to mapping from A to A', specifying how many next bytes correspond
|
|
// in A and A'
|
|
void Copy(int bytes);
|
|
|
|
// Add to mapping from A to A', specifying how many next bytes are
|
|
// inserted in A' while not advancing in A at all
|
|
void Insert(int bytes);
|
|
|
|
// Add to mapping from A to A', specifying how many next bytes are
|
|
// deleted from A while not advancing in A' at all
|
|
void Delete(int bytes);
|
|
|
|
// Print map to file, for debugging
|
|
void Printmap(const char* filename);
|
|
|
|
// [Finish building map,] Re-position to offset 0
|
|
// This call is optional; MapForward and MapBack finish building the map
|
|
// if necessary
|
|
void Reset();
|
|
|
|
// Map an offset in A' to the corresponding offset in A
|
|
int MapBack(int aprimeoffset);
|
|
|
|
// Map an offset in A to the corresponding offset in A'
|
|
int MapForward(int aoffset);
|
|
|
|
// h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
|
|
// from A' to A'' and h is from A to A''.
|
|
//
|
|
// Note that g->MoveForward(f->MoveForward(aoffset)) always equals
|
|
// to h->MoveForward(aoffset), while
|
|
// f->MoveBack(g->MoveBack(aprimeprimeoffset)) doesn't always equals
|
|
// to h->MoveBack(aprimeprimeoffset). This happens when deletion in
|
|
// f and insertion in g are at the same place. For example,
|
|
//
|
|
// A 1 2 3 4
|
|
// ^ | ^ ^
|
|
// | | / | f
|
|
// v vv v
|
|
// A' 1' 2' 3'
|
|
// ^ ^^ ^
|
|
// | | \ | g
|
|
// v | v v
|
|
// A'' 1'' 2'' 3'' 4''
|
|
//
|
|
// results in:
|
|
//
|
|
// A 1 2 3 4
|
|
// ^ ^\ ^ ^
|
|
// | | \ | | h
|
|
// v | vv v
|
|
// A'' 1'' 2'' 3'' 4''
|
|
//
|
|
// 2'' is mapped 3 in the former figure, while 2'' is mapped to 2 in
|
|
// the latter figure.
|
|
static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
|
|
|
|
// For debugging only; writes to stderr
|
|
void DumpWindow();
|
|
|
|
// For testing only -- force a mapping
|
|
void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
|
|
|
|
private:
|
|
enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
|
|
|
|
void Flush();
|
|
void FlushAll();
|
|
void MaybeFlushAll();
|
|
void Emit(MapOp op, int len);
|
|
|
|
void SetLeft();
|
|
void SetRight();
|
|
|
|
// Back up over previous range, 1..5 bytes
|
|
// Return subscript at the beginning of that. Pins at 0
|
|
int Backup(int sub);
|
|
|
|
// Parse next range, 1..5 bytes
|
|
// Return subscript just off the end of that
|
|
int ParseNext(int sub, MapOp* op, int* length);
|
|
|
|
// Parse previous range, 1..5 bytes
|
|
// Return current subscript
|
|
int ParsePrevious(int sub, MapOp* op, int* length);
|
|
|
|
void PrintPosition(const char* str);
|
|
|
|
bool MoveRight(); // Returns true if OK
|
|
bool MoveLeft(); // Returns true if OK
|
|
void DumpString();
|
|
|
|
// Copies insert operations from source to dest. Returns true if no
|
|
// other operations are found.
|
|
static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
|
|
|
|
// Copies delete operations from source to dest. Returns true if no other
|
|
// operations are found.
|
|
static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
|
|
|
|
std::string diffs_;
|
|
MapOp pending_op_;
|
|
uint32 pending_length_;
|
|
|
|
// Offsets in the ranges below correspond to each other, with A' = A + diff
|
|
int next_diff_sub_;
|
|
int current_lo_aoffset_;
|
|
int current_hi_aoffset_;
|
|
int current_lo_aprimeoffset_;
|
|
int current_hi_aprimeoffset_;
|
|
int current_diff_;
|
|
int max_aoffset_;
|
|
int max_aprimeoffset_;
|
|
};
|
|
|
|
} // namespace CLD2
|
|
|
|
#endif // UTIL_UTF8_OFFSETMAP_H_
|
|
|