Java源码示例:org.apache.parquet.column.page.PageReader

示例1
/**
 * Builds a column reader on top of the pages supplied by {@code pageReader}.
 *
 * <p>The dictionary page, if the reader has one, is decoded immediately; the resulting
 * dictionary is handed to the converter when the converter reports dictionary support.
 *
 * @param path the descriptor for the corresponding column; must not be null
 * @param pageReader the underlying store to read from; must not be null
 * @param converter a converter that materializes the values in this column in the current record; must not be null
 * @param writerVersion writer version of the Parquet file being read; stored without a null check
 * @throws NullPointerException if {@code path}, {@code pageReader} or {@code converter} is null
 * @throws ParquetDecodingException if the dictionary page cannot be decoded, or if the
 *         total value count reported by {@code pageReader} is not positive
 */
ColumnReaderBase(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) {
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.pageReader = Objects.requireNonNull(pageReader, "pageReader cannot be null");
  this.converter = Objects.requireNonNull(converter, "converter cannot be null");
  this.writerVersion = writerVersion;
  this.maxDefinitionLevel = path.getMaxDefinitionLevel();

  DictionaryPage dictPage = pageReader.readDictionaryPage();
  if (dictPage == null) {
    // No dictionary page for this column.
    this.dictionary = null;
  } else {
    try {
      this.dictionary = dictPage.getEncoding().initDictionary(path, dictPage);
      if (converter.hasDictionarySupport()) {
        converter.setDictionary(this.dictionary);
      }
    } catch (IOException e) {
      throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
    }
  }

  this.totalValueCount = pageReader.getTotalValueCount();
  if (this.totalValueCount <= 0) {
    throw new ParquetDecodingException("totalValueCount '" + this.totalValueCount + "' <= 0");
  }
}
 
示例2
/**
 * Writes four data pages of 209 values each for an INT64 column into a page store, then
 * reads pages back until the accumulated value count reaches the total reported by the
 * page reader. The total and each page are only printed; nothing is asserted yet.
 */
@Test
public void test() throws IOException {
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnDescriptor col = new ColumnDescriptor(path , PrimitiveTypeName.INT64, 2, 2);
  LongStatistics stats = new LongStatistics();
  PageWriter pageWriter = memPageStore.getPageWriter(col);
  // Byte sizes of the four pages, in write order.
  int[] pageSizes = {735, 743, 743, 735};
  for (int pageSize : pageSizes) {
    pageWriter.writePage(BytesInput.from(new byte[pageSize]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
  }

  PageReader pageReader = memPageStore.getPageReader(col);
  long expectedValues = pageReader.getTotalValueCount();
  System.out.println(expectedValues);

  int valuesRead = 0;
  do {
    DataPage page = pageReader.readPage();
    valuesRead += page.getValueCount();
    System.out.println(page);
    // TODO: assert
  } while (valuesRead < expectedValues);
}
 
示例3
/**
 * Prepares this reader for a new row group and returns the dictionary that is in place
 * once the page source has been set.
 *
 * @param store page reader handed on to {@code super.setPageSource}
 * @param allPagesDictEncoded flag forwarded to the vectorized page iterator
 * @return the value of the {@code dictionary} field after {@code super.setPageSource(store)} has run
 */
public Dictionary setRowGroupInfo(PageReader store, boolean allPagesDictEncoded) {
  // setPageSource can result in a data page read. If that happens, we need
  // to know in advance whether all the pages in the row group are dictionary encoded or not
  this.vectorizedPageIterator.setAllPagesDictEncoded(allPagesDictEncoded);
  super.setPageSource(store);
  return dictionary;
}
 
示例4
/**
 * Points this iterator at a new page source and resets its read state.
 *
 * <p>The triple count is taken from the source, both counters are zeroed, the page
 * iterator is reset and given the dictionary read from the source, and finally
 * {@code advance()} is called.
 *
 * @param source page reader to read from
 */
public void setPageSource(PageReader source) {
  this.pageSource = source;
  this.triplesCount = source.getTotalValueCount();
  this.triplesRead = 0L;
  this.advanceNextPageCount = 0L;

  BasePageIterator pages = pageIterator();
  pages.reset();
  // The dictionary is read only after the page iterator has been reset.
  this.dictionary = ParquetUtil.readDictionary(this.desc, this.pageSource);
  pages.setDictionary(this.dictionary);

  advance();
}
 
示例5
/**
 * Reads and decodes the dictionary page of a column, if there is one.
 *
 * @param desc descriptor of the column the dictionary belongs to
 * @param pageSource page reader to take the dictionary page from
 * @return the decoded dictionary, or {@code null} when the reader has no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an {@link IOException}
 */
public static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
 
示例6
/**
 * Switches this iterator to a new page source and resets its read state.
 *
 * @param source page reader to read from; its total value count becomes {@code triplesCount}
 */
public void setPageSource(PageReader source) {
  this.pageSource = source;
  this.triplesCount = source.getTotalValueCount();
  // Both progress counters start over for the new source.
  this.triplesRead = 0L;
  this.advanceNextPageCount = 0L;
  // Reset the page iterator before giving it the dictionary read from the new source.
  this.pageIterator.reset();
  this.pageIterator.setDictionary(readDictionary(desc, pageSource));
  advance();
}
 
示例7
/**
 * Reads and decodes the dictionary page of a column, if there is one.
 *
 * @param desc descriptor of the column the dictionary belongs to
 * @param pageSource page reader to take the dictionary page from
 * @return the decoded dictionary, or {@code null} when the reader has no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an {@link IOException}
 */
private static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
 
示例8
/**
 * Reads the next page of the column at {@code path} as a {@code DataPageV1} and asserts
 * its value count and byte content.
 *
 * @param schema schema used to resolve {@code path} to a column description
 * @param pages store to obtain the column's page reader from
 * @param path column path within {@code schema}
 * @param values expected value count of the page
 * @param bytes expected page bytes
 * @throws IOException declared for the byte conversions performed here
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes)
    throws IOException {
  DataPageV1 firstPage = (DataPageV1) pages.getPageReader(schema.getColumnDescription(path)).readPage();
  assertEquals(values, firstPage.getValueCount());

  byte[] expected = bytes.toByteArray();
  byte[] actual = firstPage.getBytes().toByteArray();
  assertArrayEquals(expected, actual);
}
 
示例9
/**
 * Sets up the reader state shared by the column readers: descriptor, page reader, maximum
 * definition level and, when the page reader provides one, the decoded dictionary.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader source of the dictionary page and of the total value count
 * @throws IOException if the dictionary page cannot be decoded, or if the page reader
 *         reports a total value count of zero
 */
public AbstractColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	this.descriptor = descriptor;
	this.pageReader = pageReader;
	this.maxDefLevel = descriptor.getMaxDefinitionLevel();

	DictionaryPage dictPage = pageReader.readDictionaryPage();
	if (dictPage == null) {
		this.dictionary = null;
		this.isCurrentPageDictionaryEncoded = false;
	} else {
		try {
			this.dictionary = dictPage.getEncoding().initDictionary(descriptor, dictPage);
			this.isCurrentPageDictionaryEncoded = true;
		} catch (IOException e) {
			throw new IOException("could not decode the dictionary for " + descriptor, e);
		}
	}

	// The page reader reports the total number of values in this column (in this row
	// group); a count of zero is rejected.
	if (pageReader.getTotalValueCount() == 0) {
		throw new IOException("totalValueCount == 0");
	}
}
 
示例10
/**
 * Creates a column reader for FIXED_LEN_BYTE_ARRAY columns.
 *
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @param precision value stored in the {@code precision} field
 * @throws IOException if the superclass constructor fails
 */
public FixedLenBytesColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader,
		int precision) throws IOException {
	super(descriptor, pageReader);
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY);
	this.precision = precision;
}
 
示例11
/**
 * Creates a column reader for timestamp columns stored as INT96.
 *
 * @param utcTimestamp value stored in the {@code utcTimestamp} field
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public TimestampColumnReader(
		boolean utcTimestamp,
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	this.utcTimestamp = utcTimestamp;
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT96);
}
 
示例12
/**
 * Creates the column reader for {@code path}.
 *
 * <p>When the page read store exposes row indexes, a {@code SynchronizingColumnReader}
 * bound to those indexes is returned; otherwise a plain {@code ColumnReaderImpl}.
 *
 * @param path descriptor of the column to read
 * @return a new column reader for the column
 */
@Override
public ColumnReader getColumnReader(ColumnDescriptor path) {
  PrimitiveConverter valueConverter = getPrimitiveConverter(path);
  PageReader pages = pageReadStore.getPageReader(path);
  Optional<PrimitiveIterator.OfLong> rowIndexes = pageReadStore.getRowIndexes();
  if (!rowIndexes.isPresent()) {
    return new ColumnReaderImpl(path, pages, valueConverter, writerVersion);
  }
  return new SynchronizingColumnReader(path, pages, valueConverter, writerVersion, rowIndexes.get());
}
 
示例13
/**
 * Creates a column reader that is additionally given an iterator of row indexes.
 *
 * @param path descriptor of the column, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @param converter value converter, passed to the superclass
 * @param writerVersion writer version, passed to the superclass
 * @param rowIndexes iterator of row indexes stored in the {@code rowIndexes} field
 */
SynchronizingColumnReader(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter,
    ParsedVersion writerVersion, PrimitiveIterator.OfLong rowIndexes) {
  super(path, pageReader, converter, writerVersion);
  this.rowIndexes = rowIndexes;
  // Start from the smallest long value before the first consume() call.
  targetRow = Long.MIN_VALUE;
  // NOTE(review): consume() is defined elsewhere and runs inside the constructor; confirm
  // it only relies on state that is already initialised at this point.
  consume();
}
 
示例14
/**
 * Creates a page reader over a copy of the list of pages written so far for a column.
 *
 * @param descriptor descriptor of the column to read
 * @return a new {@code MemPageReader} built from the writer's total value count, an
 *         iterator over the copied page list, and the writer's dictionary page
 * @throws UnknownColumnException if no page writer is registered for {@code descriptor}
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  MemPageWriter writer = pageWriters.get(descriptor);
  if (writer == null) {
    throw new UnknownColumnException(descriptor);
  }
  // Copy the list so the reader iterates over its own list instance.
  List<DataPage> pages = new ArrayList<>(writer.getPages());
  long totalValueCount = writer.getTotalValueCount();
  LOG.debug("initialize page reader with {} values and {} pages", totalValueCount, pages.size());
  return new MemPageReader(totalValueCount, pages.iterator(), writer.getDictionaryPage());
}
 
示例15
/**
 * Returns the page reader registered for a column.
 *
 * @param path descriptor of the column to look up
 * @return the page reader stored for {@code path}
 * @throws IllegalArgumentException if no reader is stored for {@code path}; the message
 *         lists the known columns and the row count
 */
@Override
public PageReader getPageReader(ColumnDescriptor path) {
  final PageReader found = readers.get(path);
  if (found != null) {
    return found;
  }
  throw new IllegalArgumentException(path + " is not in the store: " + readers.keySet() + " " + rowCount);
}
 
示例16
/**
 * Reads the next page of the column at {@code path} as a {@code DataPageV2} and asserts
 * its counts, uncompressed size, level bytes and data bytes.
 *
 * @param schema schema used to resolve {@code path} to a column description
 * @param pages store to obtain the column's page reader from
 * @param path column path within {@code schema}
 * @param values expected value count
 * @param rows expected row count
 * @param nullCount expected null count
 * @param repetition expected repetition-level bytes
 * @param definition expected definition-level bytes
 * @param data expected data bytes
 * @param uncompressedSize expected uncompressed size
 * @throws IOException declared for the byte conversions performed here
 */
private void validateV2Page(MessageType schema, PageReadStore pages, String[] path, int values, int rows, int nullCount,
                            byte[] repetition, byte[] definition, byte[] data, int uncompressedSize) throws IOException {
  DataPageV2 v2Page = (DataPageV2) pages.getPageReader(schema.getColumnDescription(path)).readPage();

  // Scalar page properties first.
  assertEquals(values, v2Page.getValueCount());
  assertEquals(rows, v2Page.getRowCount());
  assertEquals(nullCount, v2Page.getNullCount());
  assertEquals(uncompressedSize, v2Page.getUncompressedSize());

  // Then the three byte sections.
  assertArrayEquals(repetition, v2Page.getRepetitionLevels().toByteArray());
  assertArrayEquals(definition, v2Page.getDefinitionLevels().toByteArray());
  assertArrayEquals(data, v2Page.getData().toByteArray());
}
 
示例17
/**
 * Collects every remaining data page of a column into a list.
 *
 * <p>Each page is passed through {@code reusableCopy} before it is added; reading stops
 * at the first {@code null} returned by the page reader.
 *
 * @param pageReadStore store to obtain the column's page reader from
 * @param columnDescriptor descriptor of the column to read
 * @return the copied pages in read order; empty if the reader yields no page
 */
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  PageReader reader = pageReadStore.getPageReader(columnDescriptor);
  List<DataPage> copies = new ArrayList<DataPage>();
  for (DataPage next = reader.readPage(); next != null; next = reader.readPage()) {
    copies.add(reusableCopy(next));
  }
  return copies;
}
 
示例18
/**
 * Runs {@code validateStatsForPage} on every data page of every column in the schema.
 *
 * <p>For each column the dictionary page is read once and passed along with each data
 * page; a column's pages are read until the reader returns {@code null}.
 *
 * @param schema schema whose columns are validated
 * @param store store to obtain the page readers from
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    PageReader columnPages = store.getPageReader(column);
    DictionaryPage dictionaryPage = columnPages.readDictionaryPage();
    for (DataPage next = columnPages.readPage(); next != null; next = columnPages.readPage()) {
      validateStatsForPage(next, dictionaryPage, column);
    }
  }
}
 
示例19
/**
 * Looks up the page reader for a column in {@code columns}.
 *
 * @param descriptor descriptor of the column to look up
 * @return whatever {@code columns.get(descriptor)} yields, returned unchanged
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  // NOTE(review): if columns is a java.util.Map, an unknown descriptor yields null here
  // rather than an exception — confirm callers handle that.
  return columns.get(descriptor);
}
 
示例20
/**
 * Looks up the page reader for a column in {@code columns}.
 *
 * @param descriptor descriptor of the column to look up
 * @return whatever {@code columns.get(descriptor)} yields, returned unchanged
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  // NOTE(review): if columns is a java.util.Map, an unknown descriptor yields null here
  // rather than an exception — confirm callers handle that.
  return columns.get(descriptor);
}
 
示例21
/**
 * Creates a byte column reader; the type name passed to {@code checkTypeName} is INT32.
 *
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public ByteColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}
 
示例22
/**
 * Creates a boolean column reader; the type name passed to {@code checkTypeName} is BOOLEAN.
 *
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public BooleanColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.BOOLEAN);
}
 
示例23
/**
 * Creates a long column reader; the type name passed to {@code checkTypeName} is INT64.
 *
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public LongColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT64);
}
 
示例24
/**
 * Creates a short column reader; the type name passed to {@code checkTypeName} is INT32.
 *
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public ShortColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}
 
示例25
/**
 * Creates a double column reader; the type name passed to {@code checkTypeName} is DOUBLE.
 *
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public DoubleColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.DOUBLE);
}
 
示例26
/**
 * Creates an int column reader; the type name passed to {@code checkTypeName} is INT32.
 *
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public IntColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}
 
示例27
/**
 * Creates a bytes column reader; the type name passed to {@code checkTypeName} is BINARY.
 *
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public BytesColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.BINARY);
}
 
示例28
/**
 * Creates a float column reader; the type name passed to {@code checkTypeName} is FLOAT.
 *
 * @param descriptor descriptor of the column to read, passed to the superclass
 * @param pageReader page source, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public FloatColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	// NOTE(review): checkTypeName is defined elsewhere; presumably it rejects a descriptor
	// whose primitive type differs from the given one — confirm.
	checkTypeName(PrimitiveType.PrimitiveTypeName.FLOAT);
}
 
示例29
/**
 * Picks the column reader implementation that matches a logical field type.
 *
 * <p>DECIMAL is resolved further by the primitive type recorded in the column descriptor.
 *
 * @param utcTimestamp flag passed to {@code TimestampColumnReader} for timestamp types
 * @param fieldType logical type of the field; its type root selects the reader
 * @param descriptor descriptor of the column to read
 * @param pageReader page source for the reader
 * @return a new column reader for the field
 * @throws IOException if constructing the selected reader fails
 * @throws UnsupportedOperationException if the type root is not handled, or if a DECIMAL
 *         column has a primitive type other than INT32, INT64, BINARY or FIXED_LEN_BYTE_ARRAY
 */
public static ColumnReader createColumnReader(
		boolean utcTimestamp,
		LogicalType fieldType,
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	switch (fieldType.getTypeRoot()) {
		case BOOLEAN:
			return new BooleanColumnReader(descriptor, pageReader);
		case TINYINT:
			return new ByteColumnReader(descriptor, pageReader);
		case SMALLINT:
			return new ShortColumnReader(descriptor, pageReader);
		case INTEGER:
		case DATE:
		case TIME_WITHOUT_TIME_ZONE:
			return new IntColumnReader(descriptor, pageReader);
		case BIGINT:
			return new LongColumnReader(descriptor, pageReader);
		case FLOAT:
			return new FloatColumnReader(descriptor, pageReader);
		case DOUBLE:
			return new DoubleColumnReader(descriptor, pageReader);
		case CHAR:
		case VARCHAR:
		case BINARY:
		case VARBINARY:
			return new BytesColumnReader(descriptor, pageReader);
		case TIMESTAMP_WITHOUT_TIME_ZONE:
		case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
			return new TimestampColumnReader(utcTimestamp, descriptor, pageReader);
		case DECIMAL:
			return createDecimalColumnReader(fieldType, descriptor, pageReader);
		default:
			throw new UnsupportedOperationException(fieldType + " is not supported now.");
	}
}

/**
 * Selects the reader for a DECIMAL field from the primitive type stored in the descriptor.
 */
private static ColumnReader createDecimalColumnReader(
		LogicalType fieldType,
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) {
		case INT32:
			return new IntColumnReader(descriptor, pageReader);
		case INT64:
			return new LongColumnReader(descriptor, pageReader);
		case BINARY:
			return new BytesColumnReader(descriptor, pageReader);
		case FIXED_LEN_BYTE_ARRAY:
			return new FixedLenBytesColumnReader(
					descriptor, pageReader, ((DecimalType) fieldType).getPrecision());
		default:
			// Same outcome as an unhandled type root.
			throw new UnsupportedOperationException(fieldType + " is not supported now.");
	}
}
 
示例30
/**
 * Creates a {@code ColumnReaderImpl} for a column, using the primitive converter looked
 * up for {@code path} and this object's {@code writerVersion}.
 *
 * @param path descriptor of the column to read
 * @param pageReader page source for the reader
 * @return the new column reader
 */
private ColumnReaderImpl newMemColumnReader(ColumnDescriptor path, PageReader pageReader) {
  return new ColumnReaderImpl(path, pageReader, getPrimitiveConverter(path), writerVersion);
}