ICU 52.1  52.1
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
56 // Forward Declarations
57 
59 
// Forward declarations only. The classes in this header use these types via
// pointers, references, or friend declarations, so complete definitions are
// not required at this point.
60 struct Regex8BitSet;
61 class RegexCImpl;
62 class RegexMatcher;
63 class RegexPattern;
64 struct REStackFrame;
65 class RuleBasedBreakIterator;
66 class UnicodeSet;
67 class UVector;
68 class UVector32;
69 class UVector64;
70 
71 #ifndef U_HIDE_INTERNAL_API
72 
// Debug hook. With REGEX_DEBUG defined, RegexPatternDump() is declared as a
// real function taking the pattern to dump. Without it, any existing macro of
// that name is removed and RegexPatternDump(pat) is redefined as an empty
// function-like macro, so calls to it expand to nothing.
76 #ifdef REGEX_DEBUG
77 U_INTERNAL void U_EXPORT2
78  RegexPatternDump(const RegexPattern *pat);
79 #else
80  #undef RegexPatternDump
81  #define RegexPatternDump(pat)
82 #endif
83 #endif /* U_HIDE_INTERNAL_API */
84 
85 
86 
99 public:
100 
// Default constructor.
108  RegexPattern();
109 
// Copy constructor.
116  RegexPattern(const RegexPattern &source);
117 
// Virtual destructor.
123  virtual ~RegexPattern();
124 
// Equality comparison against another RegexPattern. What makes two patterns
// "equal" is decided by the out-of-line implementation, not visible here.
133  UBool operator==(const RegexPattern& that) const;
134 
// Inequality: the logical negation of whatever operator== reports for `that`.
143  inline UBool operator!=(const RegexPattern& that) const { const UBool same = operator==(that); return !same; }
144 
// Copy assignment; returns a RegexPattern reference.
150  RegexPattern &operator =(const RegexPattern &source);
151 
// Polymorphic copy. Returns a pointer to a RegexPattern.
// NOTE(review): presumably the result is heap-allocated and owned by the
// caller - confirm against the ICU API documentation.
159  virtual RegexPattern *clone() const;
160 
161 
// Static factory overloads. The six signatures vary along three axes:
//   - the pattern source is a UnicodeString or a UText;
//   - an optional uint32_t `flags` argument;
//   - an optional UParseError out-parameter `pe`.
// All of them report failure through `status`.
// NOTE(review): presumably `flags` carries URegexpFlag bits from uregex.h and
// the returned RegexPattern is owned by the caller - confirm against the ICU
// API documentation.

// UnicodeString pattern, parse-error details, no flags.
186  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
187  UParseError &pe,
188  UErrorCode &status);
189 
// UText pattern, parse-error details, no flags.
216  static RegexPattern * U_EXPORT2 compile( UText *regex,
217  UParseError &pe,
218  UErrorCode &status);
219 
// UnicodeString pattern, flags, parse-error details.
244  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
245  uint32_t flags,
246  UParseError &pe,
247  UErrorCode &status);
248 
// UText pattern, flags, parse-error details.
275  static RegexPattern * U_EXPORT2 compile( UText *regex,
276  uint32_t flags,
277  UParseError &pe,
278  UErrorCode &status);
279 
// UnicodeString pattern, flags, no parse-error details.
302  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
303  uint32_t flags,
304  UErrorCode &status);
305 
// UText pattern, flags, no parse-error details.
330  static RegexPattern * U_EXPORT2 compile( UText *regex,
331  uint32_t flags,
332  UErrorCode &status);
333 
// Accessor for the pattern's flag bits (see the fFlags member: "the flags used
// when compiling the pattern").
339  virtual uint32_t flags() const;
340 
// Creates a RegexMatcher for this pattern over a UnicodeString input.
// NOTE(review): presumably the matcher is owned by the caller and keeps a
// reference to `input` rather than a copy - confirm.
358  virtual RegexMatcher *matcher(const UnicodeString &input,
359  UErrorCode &status) const;
360 
361 private:
// Private overload taking a raw UChar pointer, so outside callers cannot use it.
// NOTE(review): presumably declared only to stop a UChar* argument from being
// implicitly converted to a temporary UnicodeString for the overload above -
// confirm.
374  RegexMatcher *matcher(const UChar *input,
375  UErrorCode &status) const;
376 public:
377 
378 
// Creates a RegexMatcher for this pattern without supplying any input text.
390  virtual RegexMatcher *matcher(UErrorCode &status) const;
391 
392 
// Static one-shot helpers: take a pattern and an input (both UnicodeString, or
// both UText) and return a UBool. Parse-error details go to `pe`, failures to
// `status`.
// NOTE(review): presumably the result is TRUE only when the pattern matches the
// entire input - confirm against the ICU API documentation.
407  static UBool U_EXPORT2 matches(const UnicodeString &regex,
408  const UnicodeString &input,
409  UParseError &pe,
410  UErrorCode &status);
411 
426  static UBool U_EXPORT2 matches(UText *regex,
427  UText *input,
428  UParseError &pe,
429  UErrorCode &status);
430 
// Returns the pattern as a UnicodeString, by value.
439  virtual UnicodeString pattern() const;
440 
441 
// Returns the pattern as a UText pointer; can fail via `status`.
// NOTE(review): presumably this is the fPattern member itself, not a clone, so
// the caller must not close it - confirm.
452  virtual UText *patternText(UErrorCode &status) const;
453 
454 
// Splits `input` into the caller-supplied array `dest`, which has room for
// `destCapacity` elements, and returns an int32_t.
// NOTE(review): presumably the pattern acts as the field delimiter and the
// return value is the number of fields written - confirm.
493  virtual int32_t split(const UnicodeString &input,
494  UnicodeString dest[],
495  int32_t destCapacity,
496  UErrorCode &status) const;
497 
498 
// UText flavour of split(): same shape, with UText* input and UText* array output.
537  virtual int32_t split(UText *input,
538  UText *dest[],
539  int32_t destCapacity,
540  UErrorCode &status) const;
541 
542 
// Class-ID pair: a virtual per-object query and a static per-class query, both
// returning UClassID.
548  virtual UClassID getDynamicClassID() const;
549 
555  static UClassID U_EXPORT2 getStaticClassID();
556 
557 private:
558  //
559  // Implementation Data
560  //
561  UText *fPattern; // The original pattern string.
562  UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
563  uint32_t fFlags; // The flags used when compiling the pattern.
564  //
565  UVector64 *fCompiledPat; // The compiled pattern p-code.
566  UnicodeString fLiteralText; // Any literal string data from the pattern,
567  // after un-escaping, for use during the match.
568 
569  UVector *fSets; // Any UnicodeSets referenced from the pattern.
570  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
571 
572 
573  UErrorCode fDeferredStatus; // status if some prior error has left this
574  // RegexPattern in an unusable state.
575 
576  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
577  // >= this value. For some patterns, this calculated
578  // value may be less than the true shortest
579  // possible match.
580 
581  int32_t fFrameSize; // Size of a state stack frame in the
582  // execution engine.
583 
584  int32_t fDataSize; // The size of the data needed by the pattern that
585  // does not go on the state stack, but has just
586  // a single copy per matcher.
587 
588  UVector32 *fGroupMap; // Map from capture group number to position of
589  // the group's variables in the matcher stack frame.
590 
591  int32_t fMaxCaptureDigits; // NOTE(review): undocumented here; presumably the max
                                 // number of digits in a capture group number - confirm.
592 
593  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
594  // regex character classes, e.g. Word.
595 
596  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
597  // sets for predefined regex classes.
598 
// NOTE(review): the fInitial* fields below are undocumented in this listing.
// Presumably they hold the literal string / character / character set that a
// match must begin with, as selected by fStartType - confirm in the compiler.
599  int32_t fStartType; // Info on how a match must start.
600  int32_t fInitialStringIdx; // presumably an index into fLiteralText - TODO confirm
601  int32_t fInitialStringLen;
602  UnicodeSet *fInitialChars;
603  UChar32 fInitialChar;
604  Regex8BitSet *fInitialChars8;
605  UBool fNeedsAltInput; // NOTE(review): presumably tells the matcher to create its
                           // alternate input text (fAltInputText) - confirm.
606 
// These classes are granted access to the private members above.
607  friend class RegexCompile;
608  friend class RegexMatcher;
609  friend class RegexCImpl;
610 
611  //
612  // Implementation Methods
613  //
614  void init(); // Common initialization, for use by constructors.
615  void zap(); // Common cleanup
// Debug-only helpers, compiled in only when REGEX_DEBUG is defined. The free
// function RegexPatternDump() is made a friend so it can read private state.
616 #ifdef REGEX_DEBUG
617  void dumpOp(int32_t index) const;
618  friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
619 #endif
620 
621 };
622 
623 
624 
635 public:
636 
// Public constructors. Each takes a pattern (UnicodeString or UText) plus
// `flags`, and optionally an input text of the same kind; failures are reported
// through `status`.
// NOTE(review): presumably each one compiles the pattern and the matcher owns
// the result (see fPatternOwned) - confirm.

// Pattern only (UnicodeString).
651  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
652 
// Pattern only (UText).
668  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
669 
// Pattern and input (UnicodeString).
691  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
692  uint32_t flags, UErrorCode &status);
693 
// Pattern and input (UText).
715  RegexMatcher(UText *regexp, UText *input,
716  uint32_t flags, UErrorCode &status);
717 
718 private:
// Private overload taking a raw UChar pointer as input, so outside callers
// cannot use it.
// NOTE(review): presumably declared only to block an implicit UChar* ->
// UnicodeString temporary binding to the public overload - confirm.
731  RegexMatcher(const UnicodeString &regexp, const UChar *input,
732  uint32_t flags, UErrorCode &status);
733 public:
734 
735 
// Virtual destructor.
741  virtual ~RegexMatcher();
742 
743 
// Match attempts. Each returns a UBool; the overloads with `startIndex` /
// `start` take a 64-bit starting position.
// NOTE(review): presumably matches() requires the whole region to match,
// lookingAt() anchors only at the start, and find() searches for the next
// match - confirm against the ICU API documentation.
750  virtual UBool matches(UErrorCode &status);
751 
752 
763  virtual UBool matches(int64_t startIndex, UErrorCode &status);
764 
765 
779  virtual UBool lookingAt(UErrorCode &status);
780 
781 
795  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
796 
797 
// Unlike its siblings, this overload has no UErrorCode parameter.
810  virtual UBool find();
811 
812 
822  virtual UBool find(int64_t start, UErrorCode &status);
823 
824 
// Capture-group text accessors. The UnicodeString overloads return by value;
// the UText overloads take a destination UText and return a UText pointer.
// `groupNum` selects a capture group, and `group_len` is a non-const reference,
// i.e. an output.
// NOTE(review): presumably the overloads without `groupNum` refer to the whole
// match - confirm.
834  virtual UnicodeString group(UErrorCode &status) const;
835 
836 
849  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
850 
851 
// Number of capture groups, as an int32_t.
857  virtual int32_t groupCount() const;
858 
859 
874  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
875 
891  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
892 
908  virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
909 
910 
// Match position accessors. Each query comes in a 32-bit form (start / end)
// and a 64-bit form (start64 / end64), with and without a capture `group`.
// NOTE(review): presumably the 32-bit forms report an error through `status`
// when the position does not fit in int32_t - confirm.
918  virtual int32_t start(UErrorCode &status) const;
919 
927  virtual int64_t start64(UErrorCode &status) const;
928 
929 
943  virtual int32_t start(int32_t group, UErrorCode &status) const;
944 
958  virtual int64_t start64(int32_t group, UErrorCode &status) const;
959 
960 
974  virtual int32_t end(UErrorCode &status) const;
975 
989  virtual int64_t end64(UErrorCode &status) const;
990 
991 
1009  virtual int32_t end(int32_t group, UErrorCode &status) const;
1010 
1028  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1029 
1030 
// Reset family. Every overload returns a RegexMatcher reference
// (NOTE(review): presumably *this, to allow call chaining - confirm).

// Reset with no arguments.
1039  virtual RegexMatcher &reset();
1040 
1041 
// Reset to a given 64-bit position.
1057  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1058 
1059 
// Reset with a new UnicodeString input.
1077  virtual RegexMatcher &reset(const UnicodeString &input);
1078 
1079 
// Reset with a new UText input.
1093  virtual RegexMatcher &reset(UText *input);
1094 
1095 
// Supplies a UText for the input without going through reset().
// NOTE(review): presumably the new text must have the same content as the
// current input and match state is preserved - confirm.
1120  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1121 
1122 private:
// Private overload taking a raw UChar pointer, so outside callers cannot use it.
// NOTE(review): presumably declared only to block an implicit UChar* ->
// UnicodeString temporary binding to reset(const UnicodeString &) - confirm.
1135  RegexMatcher &reset(const UChar *input);
1136 public:
1137 
// Input text accessors: by const UnicodeString reference, by UText pointer, or
// via getInput(), which takes a destination UText and can fail via `status`.
1145  virtual const UnicodeString &input() const;
1146 
1155  virtual UText *inputText() const;
1156 
1167  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1168 
1169 
// Region setters: restrict matching to [start, limit) style bounds given as
// 64-bit positions. The second overload also takes a `startIndex`.
1188  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1189 
1201  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1202 
// Region getters, each in a 32-bit and a 64-bit form.
1211  virtual int32_t regionStart() const;
1212 
1221  virtual int64_t regionStart64() const;
1222 
1223 
1232  virtual int32_t regionEnd() const;
1233 
1242  virtual int64_t regionEnd64() const;
1243 
// Transparent-bounds flag: query and setter (see the fTransparentBounds member).
1252  virtual UBool hasTransparentBounds() const;
1253 
1272  virtual RegexMatcher &useTransparentBounds(UBool b);
1273 
1274 
// Anchoring-bounds flag: query and setter (see the fAnchoringBounds member).
1282  virtual UBool hasAnchoringBounds() const;
1283 
1284 
1297  virtual RegexMatcher &useAnchoringBounds(UBool b);
1298 
1299 
// Status of the last match with respect to end of input (see the fHitEnd and
// fRequireEnd members).
1312  virtual UBool hitEnd() const;
1313 
1323  virtual UBool requireEnd() const;
1324 
1325 
// The RegexPattern this matcher uses, by const reference.
1331  virtual const RegexPattern &pattern() const;
1332 
1333 
// Replacement operations, each in a UnicodeString and a UText form. The
// UnicodeString forms return the result by value; the UText forms take a
// destination UText and return a UText pointer.
// NOTE(review): presumably `replacement` may contain capture-group references
// such as $1 - confirm against the ICU API documentation.
1350  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1351 
1352 
1373  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1374 
1375 
1396  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1397 
1398 
1423  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1424 
1425 
// Incremental replacement: appendReplacement() writes into `dest` and returns a
// RegexMatcher reference; appendTail() finishes by appending to `dest`. The
// fAppendPosition member tracks the position after the previous
// appendReplacement().
1453  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1454  const UnicodeString &replacement, UErrorCode &status);
1455 
1456 
1484  virtual RegexMatcher &appendReplacement(UText *dest,
1485  UText *replacement, UErrorCode &status);
1486 
1487 
1498  virtual UnicodeString &appendTail(UnicodeString &dest);
1499 
1500 
1514  virtual UText *appendTail(UText *dest, UErrorCode &status);
1515 
1516 
// Splits `input` into the caller-supplied array `dest`, which has room for
// `destCapacity` elements, and returns an int32_t. These are non-const member
// functions.
// NOTE(review): presumably the return value is the number of fields written
// and the call resets the matcher onto `input` - confirm.
1540  virtual int32_t split(const UnicodeString &input,
1541  UnicodeString dest[],
1542  int32_t destCapacity,
1543  UErrorCode &status);
1544 
1545 
// UText flavour of split().
1569  virtual int32_t split(UText *input,
1570  UText *dest[],
1571  int32_t destCapacity,
1572  UErrorCode &status);
1573 
// Time limit for the match engine. Per the fTimeLimit member comment the unit
// is "arbitrary steps" and zero means unlimited.
1595  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1596 
1603  virtual int32_t getTimeLimit() const;
1604 
// Backtrack-stack size limit. Per the fStackLimit member comment the unit is
// bytes and zero means unlimited.
1626  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1627 
1635  virtual int32_t getStackLimit() const;
1636 
1637 
// Match-progress callback: the setter stores a function pointer plus an opaque
// user context; the getter returns both through reference out-parameters.
1651  virtual void setMatchCallback(URegexMatchCallback *callback,
1652  const void *context,
1653  UErrorCode &status);
1654 
1655 
1666  virtual void getMatchCallback(URegexMatchCallback *&callback,
1667  const void *&context,
1668  UErrorCode &status);
1669 
1670 
// Find-progress callback: same setter/getter shape as the match callback, but
// with the URegexFindProgressCallback type.
1684  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1685  const void *context,
1686  UErrorCode &status);
1687 
1688 
1699  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1700  const void *&context,
1701  UErrorCode &status);
1702 
// Internal API, hidden when U_HIDE_INTERNAL_API is defined: switches debug
// tracing on or off (see the fTraceDebug member).
1703 #ifndef U_HIDE_INTERNAL_API
1704 
1709  void setTrace(UBool state);
1710 #endif /* U_HIDE_INTERNAL_API */
1711 
// Class-ID pair: a static per-class query and a virtual per-object query, both
// returning UClassID.
1717  static UClassID U_EXPORT2 getStaticClassID();
1718 
1724  virtual UClassID getDynamicClassID() const;
1725 
1726 private:
1727  // Constructors and other object boilerplate are private.
1728  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1729  RegexMatcher(); // default constructor not implemented
// Construct from an existing pattern; private, so only this class and its
// friends below can call it.
// NOTE(review): presumably this is what RegexPattern::matcher() uses - confirm.
1730  RegexMatcher(const RegexPattern *pat);
// Copy constructor and copy assignment are declared private to forbid copying.
1731  RegexMatcher(const RegexMatcher &other);
1732  RegexMatcher &operator =(const RegexMatcher &rhs);
1733  void init(UErrorCode &status); // Common initialization
1734  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1735 
1736  friend class RegexPattern;
1737  friend class RegexCImpl;
1738 public:
// Internal API, hidden when U_HIDE_INTERNAL_API is defined.
1739 #ifndef U_HIDE_INTERNAL_API
1740 
1741  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1742 #endif /* U_HIDE_INTERNAL_API */
1743 private:
1744 
1745  //
1746  // MatchAt This is the internal interface to the match engine itself.
1747  // Match status comes back in matcher member variables.
1748  //
1749  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1750  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1751  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1752  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1753  REStackFrame *resetStack();
1754  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1755  void IncrementTime(UErrorCode &status);
1756  UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status);
1757 
1758  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1759 
// "Chunk" variants of find / MatchAt / isWordBoundary. Note that they take
// 32-bit positions where the general versions above take 64-bit ones.
// NOTE(review): presumably a fast path for input held in one contiguous UText
// chunk - confirm in the implementation.
1760  UBool findUsingChunk();
1761  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1762  UBool isChunkWordBoundary(int32_t pos);
1763 
1764  const RegexPattern *fPattern; // The pattern this matcher uses (not necessarily owned;
                                    // see fPatternOwned).
1765  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1766  // should delete it when through.
1767 
1768  const UnicodeString *fInput; // The string being matched. Only used for input()
1769  UText *fInputText; // The text being matched. Is never NULL.
1770  UText *fAltInputText; // A shallow copy of the text being matched.
1771  // Only created if the pattern contains backreferences.
1772  int64_t fInputLength; // Full length of the input text.
1773  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1774 
1775  int64_t fRegionStart; // Start of the input region, default = 0.
1776  int64_t fRegionLimit; // End of input region, default to input.length.
1777 
1778  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1779  int64_t fAnchorLimit; // See useAnchoringBounds
1780 
1781  int64_t fLookStart; // Region bounds for look-ahead/behind
1782  int64_t fLookLimit; // and other boundary tests. See
1783  // useTransparentBounds
1784 
1785  int64_t fActiveStart; // Currently active bounds for matching.
1786  int64_t fActiveLimit; // Usually is the same as region, but
1787  // is changed to fLookStart/Limit when
1788  // entering look around regions.
1789 
1790  UBool fTransparentBounds; // True if using transparent bounds.
1791  UBool fAnchoringBounds; // True if using anchoring bounds.
1792 
1793  UBool fMatch; // True if the last attempted match was successful.
1794  int64_t fMatchStart; // Position of the start of the most recent match
1795  int64_t fMatchEnd; // First position after the end of the most recent match
1796  // Zero if no previous match, even when a region
1797  // is active.
1798  int64_t fLastMatchEnd; // First position after the end of the previous match,
1799  // or -1 if there was no previous match.
1800  int64_t fAppendPosition; // First position after the end of the previous
1801  // appendReplacement(). As described by the
1802  // JavaDoc for Java Matcher, where it is called
1803  // "append position"
1804  UBool fHitEnd; // True if the last match touched the end of input.
1805  UBool fRequireEnd; // True if the last match required end-of-input
1806  // (matched $ or Z)
1807 
1808  UVector64 *fStack; // NOTE(review): undocumented here; presumably the backtrack
                         // stack that holds the REStackFrames - confirm.
1809  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1810  // which will contain the capture group results.
1811  // NOT valid while match engine is running.
1812 
1813  int64_t *fData; // Data area for use by the compiled pattern.
1814  int64_t fSmallData[8]; // Use this for data if it's enough.
1815 
1816  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1817  // match engine run. Zero for unlimited.
1818 
1819  int32_t fTime; // Match time, accumulates while matching.
1820  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1821  // Kept separately from fTime to keep as much
1822  // code as possible out of the inline
1823  // StateSave function.
1824 
1825  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1826  // stack, in bytes. Zero for unlimited.
1827 
1828  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1829  // NULL if there is no callback.
1830  const void *fCallbackContext; // User Context ptr for callback function.
1831 
1832  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct.
1833  // NULL if there is no callback.
1834  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1835 
1836 
1837  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1838 
1839  UBool fTraceDebug; // Set true for debug tracing of match engine.
1840 
1841  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1842  // reported, or that permanently disables this matcher.
1843 
1844  RuleBasedBreakIterator *fWordBreakItr; // NOTE(review): undocumented here; presumably the
                                             // break iterator behind isUWordBoundary() - confirm.
1845 };
1846 
1848 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1849 #endif